From 7af80f6618adc4aa5337f94038676f7bc1b5a967 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 21 Dec 2021 05:37:32 -0500
Subject: [PATCH] Convert docstrings of modeling files (#14850)
* Convert file_utils docstrings to Markdown
* Test on BERT
* Return block indent
* Temporarily disable doc styler
* Remove from quality checks as well
* Remove doc styler mess
* Remove check from circleCI
* Fix typo
* Convert file_utils docstrings to Markdown
* Test on BERT
* Return block indent
* Temporarily disable doc styler
* Remove from quality checks as well
* Remove doc styler mess
* Remove check from circleCI
* Fix typo
* Let's go on all other model files
* Add templates too
* Styling and quality
---
.circleci/config.yml | 2 +-
Makefile | 4 +-
src/transformers/file_utils.py | 831 ++++++++++--------
src/transformers/modeling_flax_outputs.py | 356 ++++----
src/transformers/modeling_outputs.py | 500 +++++------
src/transformers/modeling_tf_outputs.py | 457 +++++-----
.../models/albert/modeling_albert.py | 150 ++--
.../models/albert/modeling_flax_albert.py | 78 +-
.../models/albert/modeling_tf_albert.py | 145 ++-
src/transformers/models/bart/modeling_bart.py | 374 ++++----
.../models/bart/modeling_flax_bart.py | 176 ++--
.../models/bart/modeling_tf_bart.py | 247 +++---
src/transformers/models/beit/modeling_beit.py | 153 ++--
.../models/beit/modeling_flax_beit.py | 92 +-
src/transformers/models/bert/modeling_bert.py | 302 +++----
.../models/bert/modeling_flax_bert.py | 91 +-
.../models/bert/modeling_tf_bert.py | 225 +++--
.../modeling_bert_generation.py | 136 +--
.../models/big_bird/modeling_big_bird.py | 247 +++---
.../models/big_bird/modeling_flax_big_bird.py | 98 +--
.../modeling_bigbird_pegasus.py | 369 ++++----
.../models/blenderbot/modeling_blenderbot.py | 354 ++++----
.../blenderbot/modeling_flax_blenderbot.py | 166 ++--
.../blenderbot/modeling_tf_blenderbot.py | 247 +++---
.../modeling_blenderbot_small.py | 353 ++++----
.../modeling_flax_blenderbot_small.py | 176 ++--
.../modeling_tf_blenderbot_small.py | 247 +++---
.../models/camembert/modeling_camembert.py | 22 +-
.../models/camembert/modeling_tf_camembert.py | 48 +-
.../models/canine/modeling_canine.py | 135 ++-
src/transformers/models/clip/modeling_clip.py | 212 ++---
.../models/clip/modeling_flax_clip.py | 199 +++--
.../models/convbert/modeling_convbert.py | 94 +-
.../models/convbert/modeling_tf_convbert.py | 128 ++-
src/transformers/models/ctrl/modeling_ctrl.py | 109 ++-
.../models/ctrl/modeling_tf_ctrl.py | 128 ++-
.../models/deberta/modeling_deberta.py | 151 ++--
.../models/deberta/modeling_tf_deberta.py | 160 ++--
.../models/deberta_v2/modeling_deberta_v2.py | 151 ++--
.../deberta_v2/modeling_tf_deberta_v2.py | 160 ++--
src/transformers/models/deit/modeling_deit.py | 89 +-
src/transformers/models/detr/modeling_detr.py | 371 ++++----
.../models/distilbert/modeling_distilbert.py | 139 ++-
.../distilbert/modeling_flax_distilbert.py | 43 +-
.../distilbert/modeling_tf_distilbert.py | 110 ++-
src/transformers/models/dpr/modeling_dpr.py | 154 ++--
.../models/dpr/modeling_tf_dpr.py | 180 ++--
.../models/electra/modeling_electra.py | 142 ++-
.../models/electra/modeling_flax_electra.py | 93 +-
.../models/electra/modeling_tf_electra.py | 156 ++--
.../modeling_encoder_decoder.py | 189 ++--
.../modeling_flax_encoder_decoder.py | 239 +++--
.../modeling_tf_encoder_decoder.py | 182 ++--
.../models/flaubert/modeling_flaubert.py | 86 +-
.../models/flaubert/modeling_tf_flaubert.py | 137 ++-
src/transformers/models/fnet/modeling_fnet.py | 159 ++--
src/transformers/models/fsmt/modeling_fsmt.py | 138 ++-
.../models/funnel/modeling_funnel.py | 129 ++-
.../models/funnel/modeling_tf_funnel.py | 136 ++-
.../models/gpt2/modeling_flax_gpt2.py | 68 +-
src/transformers/models/gpt2/modeling_gpt2.py | 226 +++--
.../models/gpt2/modeling_tf_gpt2.py | 225 +++--
.../models/gpt_neo/modeling_flax_gpt_neo.py | 68 +-
.../models/gpt_neo/modeling_gpt_neo.py | 113 ++-
.../models/gptj/modeling_flax_gptj.py | 68 +-
src/transformers/models/gptj/modeling_gptj.py | 116 +--
.../models/hubert/modeling_hubert.py | 75 +-
.../models/hubert/modeling_tf_hubert.py | 153 ++--
.../models/ibert/modeling_ibert.py | 102 +--
.../models/imagegpt/modeling_imagegpt.py | 208 ++---
.../models/layoutlm/modeling_layoutlm.py | 234 +++--
.../models/layoutlm/modeling_tf_layoutlm.py | 263 +++---
.../models/layoutlmv2/modeling_layoutlmv2.py | 176 ++--
src/transformers/models/led/modeling_led.py | 535 ++++++-----
.../models/led/modeling_tf_led.py | 372 ++++----
.../models/longformer/modeling_longformer.py | 438 +++++----
.../longformer/modeling_tf_longformer.py | 399 ++++-----
src/transformers/models/luke/modeling_luke.py | 367 ++++----
.../models/lxmert/modeling_lxmert.py | 208 +++--
.../models/lxmert/modeling_tf_lxmert.py | 183 ++--
.../models/m2m_100/modeling_m2m_100.py | 283 +++---
.../models/marian/modeling_flax_marian.py | 176 ++--
.../models/marian/modeling_marian.py | 353 ++++----
.../models/marian/modeling_tf_marian.py | 247 +++---
.../models/mbart/modeling_flax_mbart.py | 176 ++--
.../models/mbart/modeling_mbart.py | 372 ++++----
.../models/mbart/modeling_tf_mbart.py | 253 +++---
.../megatron_bert/modeling_megatron_bert.py | 254 +++---
src/transformers/models/mmbt/modeling_mmbt.py | 126 ++-
.../models/mobilebert/modeling_mobilebert.py | 183 ++--
.../mobilebert/modeling_tf_mobilebert.py | 143 ++-
.../models/mpnet/modeling_mpnet.py | 85 +-
.../models/mpnet/modeling_tf_mpnet.py | 117 ++-
.../models/mt5/modeling_flax_mt5.py | 54 +-
src/transformers/models/mt5/modeling_mt5.py | 75 +-
.../models/mt5/modeling_tf_mt5.py | 75 +-
.../models/openai/modeling_openai.py | 151 ++--
.../models/openai/modeling_tf_openai.py | 162 ++--
.../models/pegasus/modeling_flax_pegasus.py | 168 ++--
.../models/pegasus/modeling_pegasus.py | 378 ++++----
.../models/pegasus/modeling_tf_pegasus.py | 251 +++---
.../models/perceiver/modeling_perceiver.py | 599 +++++++------
.../models/prophetnet/modeling_prophetnet.py | 470 +++++-----
.../models/qdqbert/modeling_qdqbert.py | 204 +++--
src/transformers/models/rag/modeling_rag.py | 663 +++++++-------
.../models/rag/modeling_tf_rag.py | 621 +++++++------
.../models/reformer/modeling_reformer.py | 163 ++--
.../models/rembert/modeling_rembert.py | 173 ++--
.../models/rembert/modeling_tf_rembert.py | 175 ++--
.../models/retribert/modeling_retribert.py | 32 +-
.../models/roberta/modeling_flax_roberta.py | 54 +-
.../models/roberta/modeling_roberta.py | 179 ++--
.../models/roberta/modeling_tf_roberta.py | 175 ++--
.../models/roformer/modeling_roformer.py | 166 ++--
.../models/roformer/modeling_tf_roformer.py | 126 ++-
.../models/segformer/modeling_segformer.py | 92 +-
src/transformers/models/sew/modeling_sew.py | 57 +-
.../models/sew_d/modeling_sew_d.py | 122 ++-
.../modeling_speech_encoder_decoder.py | 208 +++--
.../speech_to_text/modeling_speech_to_text.py | 295 +++----
.../modeling_speech_to_text_2.py | 186 ++--
.../models/splinter/modeling_splinter.py | 94 +-
.../squeezebert/modeling_squeezebert.py | 131 ++-
.../models/t5/modeling_flax_t5.py | 179 ++--
src/transformers/models/t5/modeling_t5.py | 212 +++--
src/transformers/models/t5/modeling_tf_t5.py | 213 +++--
.../models/tapas/modeling_tapas.py | 438 ++++-----
.../models/tapas/modeling_tf_tapas.py | 376 ++++----
.../transfo_xl/modeling_tf_transfo_xl.py | 142 ++-
.../models/transfo_xl/modeling_transfo_xl.py | 136 ++-
.../models/trocr/modeling_trocr.py | 186 ++--
.../models/unispeech/modeling_unispeech.py | 189 ++--
.../unispeech_sat/modeling_unispeech_sat.py | 152 ++--
.../modeling_flax_vision_encoder_decoder.py | 200 +++--
.../modeling_vision_encoder_decoder.py | 176 ++--
.../modeling_flax_vision_text_dual_encoder.py | 151 ++--
.../modeling_vision_text_dual_encoder.py | 211 ++---
.../visual_bert/modeling_visual_bert.py | 376 ++++----
.../models/vit/modeling_flax_vit.py | 45 +-
.../models/vit/modeling_tf_vit.py | 90 +-
src/transformers/models/vit/modeling_vit.py | 68 +-
.../models/wav2vec2/modeling_flax_wav2vec2.py | 122 ++-
.../models/wav2vec2/modeling_tf_wav2vec2.py | 155 ++--
.../models/wav2vec2/modeling_wav2vec2.py | 266 +++---
.../models/wavlm/modeling_wavlm.py | 124 ++-
.../models/xlm/modeling_tf_xlm.py | 149 ++--
src/transformers/models/xlm/modeling_xlm.py | 185 ++--
.../xlm_prophetnet/modeling_xlm_prophetnet.py | 145 +--
.../xlm_roberta/modeling_tf_xlm_roberta.py | 50 +-
.../xlm_roberta/modeling_xlm_roberta.py | 22 +-
.../models/xlnet/modeling_tf_xlnet.py | 325 ++++---
.../models/xlnet/modeling_xlnet.py | 403 +++++----
...ax_{{cookiecutter.lowercase_modelname}}.py | 240 +++--
...tf_{{cookiecutter.lowercase_modelname}}.py | 419 +++++----
...ng_{{cookiecutter.lowercase_modelname}}.py | 589 ++++++-------
155 files changed, 15406 insertions(+), 16137 deletions(-)
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 097a6dea80..109ff537fd 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -848,7 +848,7 @@ jobs:
- run: isort --check-only examples tests src utils
- run: python utils/custom_init_isort.py --check_only
- run: flake8 examples tests src utils
- - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
+# - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
check_repository_consistency:
working_directory: ~/transformers
diff --git a/Makefile b/Makefile
index 75b7526625..1a205d0e6d 100644
--- a/Makefile
+++ b/Makefile
@@ -48,13 +48,13 @@ quality:
isort --check-only $(check_dirs)
python utils/custom_init_isort.py --check_only
flake8 $(check_dirs)
- python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
+# python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
# Format source code automatically and check is there are any problems left that need manual fixing
extra_style_checks:
python utils/custom_init_isort.py
- python utils/style_doc.py src/transformers docs/source --max_len 119
+# python utils/style_doc.py src/transformers docs/source --max_len 119
# this target runs checks on all files and potentially modifies some of them
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 6d6490bb38..5fc0e7c393 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -832,16 +832,21 @@ def add_start_docstrings(*docstr):
def add_start_docstrings_to_model_forward(*docstr):
def docstring_decorator(fn):
- class_name = f":class:`~transformers.{fn.__qualname__.split('.')[0]}`"
- intro = f" The {class_name} forward method, overrides the :func:`__call__` special method."
+ docstring = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
+ class_name = f"[`{fn.__qualname__.split('.')[0]}`]"
+ intro = f" The {class_name} forward method, overrides the `__call__` special method."
note = r"""
- .. note::
- Although the recipe for forward pass needs to be defined within this function, one should call the
- :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post
- processing steps while the latter silently ignores them.
- """
- fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
+
+
+ Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]
+ instance afterwards instead of this since the former takes care of running the pre and post processing steps while
+ the latter silently ignores them.
+
+
+"""
+
+ fn.__doc__ = intro + note + docstring
return fn
return docstring_decorator
@@ -857,18 +862,18 @@ def add_end_docstrings(*docstr):
PT_RETURN_INTRODUCTION = r"""
Returns:
- :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` or a tuple of
- :obj:`torch.FloatTensor` (if ``return_dict=False`` is passed or when ``config.return_dict=False``) comprising
- various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
+ [`{full_output_type}`] or `tuple(torch.FloatTensor)`: A [`{full_output_type}`] or a tuple of
+ `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various
+ elements depending on the configuration ([`{config_class}`]) and inputs.
"""
TF_RETURN_INTRODUCTION = r"""
Returns:
- :class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`: A :class:`~{full_output_type}` or a tuple of
- :obj:`tf.Tensor` (if ``return_dict=False`` is passed or when ``config.return_dict=False``) comprising various
- elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
+ [`{full_output_type}`] or `tuple(tf.Tensor)`: A [`{full_output_type}`] or a tuple of `tf.Tensor` (if
+ `return_dict=False` is passed or when `config.return_dict=False`) comprising various elements depending on the
+ configuration ([`{config_class}`]) and inputs.
"""
@@ -905,284 +910,325 @@ def _convert_output_args_doc(output_args_doc):
return "\n".join(blocks)
-def _prepare_output_docstrings(output_type, config_class):
+def _prepare_output_docstrings(output_type, config_class, min_indent=None):
"""
Prepares the return part of the docstring using `output_type`.
"""
- docstrings = output_type.__doc__
+ output_docstring = output_type.__doc__
# Remove the head of the docstring to keep the list of args only
- lines = docstrings.split("\n")
+ lines = output_docstring.split("\n")
i = 0
while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None:
i += 1
if i < len(lines):
- docstrings = "\n".join(lines[(i + 1) :])
- docstrings = _convert_output_args_doc(docstrings)
+ params_docstring = "\n".join(lines[(i + 1) :])
+ params_docstring = _convert_output_args_doc(params_docstring)
# Add the return introduction
full_output_type = f"{output_type.__module__}.{output_type.__name__}"
intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION
intro = intro.format(full_output_type=full_output_type, config_class=config_class)
- return intro + docstrings
+ result = intro + params_docstring
+
+ # Apply minimum indent if necessary
+ if min_indent is not None:
+ lines = result.split("\n")
+ # Find the indent of the first nonempty line
+ i = 0
+ while len(lines[i]) == 0:
+ i += 1
+ indent = len(_get_indent(lines[i]))
+ # If too small, add indentation to all nonempty lines
+ if indent < min_indent:
+ to_add = " " * (min_indent - indent)
+ lines = [(f"{to_add}{line}" if len(line) > 0 else line) for line in lines]
+ result = "\n".join(lines)
+
+ return result
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> outputs = model(**inputs, labels=labels)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
PT_QUESTION_ANSWERING_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
- >>> inputs = tokenizer(question, text, return_tensors='pt')
- >>> start_positions = torch.tensor([1])
- >>> end_positions = torch.tensor([3])
+ >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+ >>> inputs = tokenizer(question, text, return_tensors='pt')
+ >>> start_positions = torch.tensor([1])
+ >>> end_positions = torch.tensor([3])
- >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
- >>> loss = outputs.loss
- >>> start_scores = outputs.start_logits
- >>> end_scores = outputs.end_logits
+ >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
+ >>> loss = outputs.loss
+ >>> start_scores = outputs.start_logits
+ >>> end_scores = outputs.end_logits
+ ```
"""
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
- Example of single-label classification::
+ Example of single-label classification:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
+ >>> outputs = model(**inputs, labels=labels)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
- Example of multi-label classification::
+ Example of multi-label classification:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}', problem_type="multi_label_classification")
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', problem_type="multi_label_classification")
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss
- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss
+ >>> outputs = model(**inputs, labels=labels)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
PT_MASKED_LM_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
- >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
+ >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
+ >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> outputs = model(**inputs, labels=labels)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
PT_BASE_MODEL_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```
"""
PT_MULTIPLE_CHOICE_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import torch
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> choice0 = "It is eaten with a fork and a knife."
- >>> choice1 = "It is eaten while held in the hand."
- >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> choice0 = "It is eaten with a fork and a knife."
+ >>> choice1 = "It is eaten while held in the hand."
+ >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
- >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True)
- >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
+ >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True)
+ >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
- >>> # the linear classifier still needs to be trained
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> # the linear classifier still needs to be trained
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
PT_CAUSAL_LM_SAMPLE = r"""
- Example::
+ Example:
- >>> import torch
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> import torch
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs, labels=inputs["input_ids"])
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs, labels=inputs["input_ids"])
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
PT_SPEECH_BASE_MODEL_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> from datasets import load_dataset
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> from datasets import load_dataset
- >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
- >>> sampling_rate = dataset.features["audio"].sampling_rate
+ >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+ >>> sampling_rate = dataset.features["audio"].sampling_rate
- >>> processor = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> processor = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> # audio file is decoded on the fly
- >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> # audio file is decoded on the fly
+ >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```
"""
PT_SPEECH_CTC_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> from datasets import load_dataset
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> from datasets import load_dataset
+ >>> import torch
- >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
- >>> sampling_rate = dataset.features["audio"].sampling_rate
+ >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+ >>> sampling_rate = dataset.features["audio"].sampling_rate
- >>> processor = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> processor = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> # audio file is decoded on the fly
- >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
- >>> logits = model(**inputs).logits
- >>> predicted_ids = torch.argmax(logits, dim=-1)
+ >>> # audio file is decoded on the fly
+ >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+ >>> logits = model(**inputs).logits
+ >>> predicted_ids = torch.argmax(logits, dim=-1)
- >>> # transcribe speech
- >>> transcription = processor.batch_decode(predicted_ids)
+ >>> # transcribe speech
+ >>> transcription = processor.batch_decode(predicted_ids)
- >>> # compute loss
- >>> with processor.as_target_processor():
- ... inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids
+ >>> # compute loss
+ >>> with processor.as_target_processor():
+ ... inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids
- >>> loss = model(**inputs).loss
+ >>> loss = model(**inputs).loss
+ ```
"""
PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> from datasets import load_dataset
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> from datasets import load_dataset
+ >>> import torch
- >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
- >>> sampling_rate = dataset.features["audio"].sampling_rate
+ >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+ >>> sampling_rate = dataset.features["audio"].sampling_rate
- >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> # audio file is decoded on the fly
- >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
- >>> logits = model(**inputs).logits
- >>> predicted_class_ids = torch.argmax(logits, dim=-1)
- >>> predicted_label = model.config.id2label[predicted_class_ids]
+ >>> # audio file is decoded on the fly
+ >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
+ >>> logits = model(**inputs).logits
+ >>> predicted_class_ids = torch.argmax(logits, dim=-1)
+ >>> predicted_label = model.config.id2label[predicted_class_ids]
- >>> # compute loss - target_label is e.g. "down"
- >>> target_label = model.config.id2label[0]
- >>> inputs["labels"] = torch.tensor([model.config.label2id[target_label]])
- >>> loss = model(**inputs).loss
+ >>> # compute loss - target_label is e.g. "down"
+ >>> target_label = model.config.id2label[0]
+ >>> inputs["labels"] = torch.tensor([model.config.label2id[target_label]])
+ >>> loss = model(**inputs).loss
+ ```
"""
PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> from datasets import load_dataset
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> from datasets import load_dataset
+ >>> import torch
- >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
- >>> sampling_rate = dataset.features["audio"].sampling_rate
+ >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+ >>> sampling_rate = dataset.features["audio"].sampling_rate
- >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> # audio file is decoded on the fly
- >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
- >>> logits = model(**inputs).logits
- >>> probabilities = torch.sigmoid(logits[0])
- >>> # labels is a one-hot array of shape (num_frames, num_speakers)
- >>> labels = (probabilities > 0.5).long()
+ >>> # audio file is decoded on the fly
+ >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
+ >>> logits = model(**inputs).logits
+ >>> probabilities = torch.sigmoid(logits[0])
+ >>> # labels is a one-hot array of shape (num_frames, num_speakers)
+ >>> labels = (probabilities > 0.5).long()
+ ```
"""
PT_SPEECH_XVECTOR_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> from datasets import load_dataset
- >>> import torch
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> from datasets import load_dataset
+ >>> import torch
- >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
- >>> sampling_rate = dataset.features["audio"].sampling_rate
+ >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+ >>> sampling_rate = dataset.features["audio"].sampling_rate
- >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> # audio file is decoded on the fly
- >>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt")
- >>> embeddings = model(**inputs).embeddings
- >>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
+ >>> # audio file is decoded on the fly
+ >>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt")
+ >>> embeddings = model(**inputs).embeddings
+ >>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
- >>> # the resulting embeddings can be used for cosine similarity-based retrieval
- >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
- >>> similarity = cosine_sim(embeddings[0], embeddings[1])
- >>> threshold = 0.7 # the optimal threshold is dataset-dependent
- >>> if similarity < threshold:
- ... print("Speakers are not the same!")
+ >>> # the resulting embeddings can be used for cosine similarity-based retrieval
+ >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
+ >>> similarity = cosine_sim(embeddings[0], embeddings[1])
+ >>> threshold = 0.7 # the optimal threshold is dataset-dependent
+ >>> if similarity < threshold:
+ ... print("Speakers are not the same!")
+ ```
"""
PT_SAMPLE_DOCSTRINGS = {
@@ -1202,124 +1248,138 @@ PT_SAMPLE_DOCSTRINGS = {
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
- >>> input_ids = inputs["input_ids"]
- >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+ >>> input_ids = inputs["input_ids"]
+ >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
- >>> outputs = model(inputs)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> outputs = model(inputs)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
TF_QUESTION_ANSWERING_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
- >>> input_dict = tokenizer(question, text, return_tensors='tf')
- >>> outputs = model(input_dict)
- >>> start_logits = outputs.start_logits
- >>> end_logits = outputs.end_logits
+ >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+ >>> input_dict = tokenizer(question, text, return_tensors='tf')
+ >>> outputs = model(input_dict)
+ >>> start_logits = outputs.start_logits
+ >>> end_logits = outputs.end_logits
- >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
- >>> answer = ' '.join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0]+1])
+ >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
+ >>> answer = ' '.join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0]+1])
+ ```
"""
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
- >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+ >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
- >>> outputs = model(inputs)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> outputs = model(inputs)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
TF_MASKED_LM_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
- >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
+ >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
+ >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
- >>> outputs = model(inputs)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> outputs = model(inputs)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```
"""
TF_BASE_MODEL_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
- >>> outputs = model(inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+ >>> outputs = model(inputs)
- >>> last_hidden_states = outputs.last_hidden_state
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```
"""
TF_MULTIPLE_CHOICE_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> choice0 = "It is eaten with a fork and a knife."
- >>> choice1 = "It is eaten while held in the hand."
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> choice0 = "It is eaten with a fork and a knife."
+ >>> choice1 = "It is eaten while held in the hand."
- >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='tf', padding=True)
- >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
- >>> outputs = model(inputs) # batch size is 1
+ >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='tf', padding=True)
+ >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
+ >>> outputs = model(inputs) # batch size is 1
- >>> # the linear classifier still needs to be trained
- >>> logits = outputs.logits
+ >>> # the linear classifier still needs to be trained
+ >>> logits = outputs.logits
+ ```
"""
TF_CAUSAL_LM_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
+ >>> import tensorflow as tf
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
- >>> outputs = model(inputs)
- >>> logits = outputs.logits
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+ >>> outputs = model(inputs)
+ >>> logits = outputs.logits
+ ```
"""
TF_SAMPLE_DOCSTRINGS = {
@@ -1334,108 +1394,121 @@ TF_SAMPLE_DOCSTRINGS = {
FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```
"""
FLAX_QUESTION_ANSWERING_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
- >>> inputs = tokenizer(question, text, return_tensors='jax')
+ >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+ >>> inputs = tokenizer(question, text, return_tensors='jax')
- >>> outputs = model(**inputs)
- >>> start_scores = outputs.start_logits
- >>> end_scores = outputs.end_logits
+ >>> outputs = model(**inputs)
+ >>> start_scores = outputs.start_logits
+ >>> end_scores = outputs.end_logits
+ ```
"""
FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```
"""
FLAX_MASKED_LM_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors='jax')
+ >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors='jax')
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```
"""
FLAX_BASE_MODEL_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
+ >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```
"""
FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> choice0 = "It is eaten with a fork and a knife."
- >>> choice1 = "It is eaten while held in the hand."
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> choice0 = "It is eaten with a fork and a knife."
+ >>> choice1 = "It is eaten while held in the hand."
- >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='jax', padding=True)
- >>> outputs = model(**{{k: v[None, :] for k,v in encoding.items()}})
+ >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='jax', padding=True)
+ >>> outputs = model(**{{k: v[None, :] for k,v in encoding.items()}})
- >>> logits = outputs.logits
+ >>> logits = outputs.logits
+ ```
"""
FLAX_CAUSAL_LM_SAMPLE = r"""
- Example::
+ Example:
- >>> from transformers import {processor_class}, {model_class}
+ ```python
+ >>> from transformers import {processor_class}, {model_class}
- >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+ >>> outputs = model(**inputs)
- >>> # retrieve logts for next token
- >>> next_token_logits = outputs.logits[:, -1]
+ >>> # retrieve logits for next token
+ >>> next_token_logits = outputs.logits[:, -1]
+ ```
"""
FLAX_SAMPLE_DOCSTRINGS = {
@@ -1500,9 +1573,10 @@ def add_code_sample_docstrings(
else:
raise ValueError(f"Docstring can't be built for model {model_class}")
- output_doc = _prepare_output_docstrings(output_type, config_class) if output_type is not None else ""
+ func_doc = (fn.__doc__ or "") + "".join(docstr)
+ output_doc = "" if output_type is None else _prepare_output_docstrings(output_type, config_class)
built_doc = code_sample.format(**doc_kwargs)
- fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + output_doc + built_doc
+ fn.__doc__ = func_doc + output_doc + built_doc
return fn
return docstring_decorator
@@ -1510,19 +1584,21 @@ def add_code_sample_docstrings(
def replace_return_docstrings(output_type=None, config_class=None):
def docstring_decorator(fn):
- docstrings = fn.__doc__
- lines = docstrings.split("\n")
+ func_doc = fn.__doc__
+ lines = func_doc.split("\n")
i = 0
while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None:
i += 1
if i < len(lines):
- lines[i] = _prepare_output_docstrings(output_type, config_class)
- docstrings = "\n".join(lines)
+ indent = len(_get_indent(lines[i]))
+ lines[i] = _prepare_output_docstrings(output_type, config_class, min_indent=indent)
+ func_doc = "\n".join(lines)
else:
raise ValueError(
- f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}"
+ f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, "
+ f"current docstring is:\n{func_doc}"
)
- fn.__doc__ = docstrings
+ fn.__doc__ = func_doc
return fn
return docstring_decorator
@@ -1591,8 +1667,8 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
def filename_to_url(filename, cache_dir=None):
"""
- Return the url and etag (which may be ``None``) stored for `filename`. Raise ``EnvironmentError`` if `filename` or
- its stored metadata do not exist.
+ Return the url and etag (which may be `None`) stored for *filename*. Raise `EnvironmentError` if *filename* or its
+ stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
@@ -1617,16 +1693,16 @@ def filename_to_url(filename, cache_dir=None):
def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]:
"""
- Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape
- :obj:`(model_url, etag, size_MB)`. Filenames in :obj:`cache_dir` are use to get the metadata for each model, only
- urls ending with `.bin` are added.
+ Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape `(model_url,
+ etag, size_MB)`. Filenames in `cache_dir` are used to get the metadata for each model, only urls ending with *.bin*
+ are added.
Args:
- cache_dir (:obj:`Union[str, Path]`, `optional`):
+ cache_dir (`Union[str, Path]`, *optional*):
The cache directory to search for models within. Will default to the transformers cache if unset.
Returns:
- List[Tuple]: List of tuples each with shape :obj:`(model_url, etag, size_MB)`
+ List[Tuple]: List of tuples each with shape `(model_url, etag, size_MB)`
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
@@ -1986,23 +2062,23 @@ def get_list_of_files(
local_files_only: bool = False,
) -> List[str]:
"""
- Gets the list of files inside :obj:`path_or_repo`.
+ Gets the list of files inside `path_or_repo`.
Args:
- path_or_repo (:obj:`str` or :obj:`os.PathLike`):
- Can be either the id of a repo on huggingface.co or a path to a `directory`.
- revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ path_or_repo (`str` or `os.PathLike`):
+ Can be either the id of a repo on huggingface.co or a path to a *directory*.
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `transformers-cli login` (stored in `~/.huggingface`).
+ local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns:
- :obj:`List[str]`: The list of files available in :obj:`path_or_repo`.
+ `List[str]`: The list of files available in `path_or_repo`.
"""
path_or_repo = str(path_or_repo)
# If path_or_repo is a folder, we just return what is inside (subdirectories included).
@@ -2083,8 +2159,7 @@ def is_torch_fx_proxy(x):
def is_tensor(x):
"""
- Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor`, obj:`jaxlib.xla_extension.DeviceArray` or
- :obj:`np.ndarray`.
+ Tests if `x` is a `torch.Tensor`, `tf.Tensor`, `jaxlib.xla_extension.DeviceArray` or `np.ndarray`.
"""
if is_torch_fx_proxy(x):
return True
@@ -2177,13 +2252,16 @@ def to_numpy(obj):
class ModelOutput(OrderedDict):
"""
- Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
- a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular
+ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
+ tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
python dictionary.
- .. warning::
- You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
- method to convert it to a tuple before.
+ <Tip warning={true}>
+
+ You can't unpack a `ModelOutput` directly. Use the [`~file_utils.ModelOutput.to_tuple`] method to convert it to a
+ tuple before.
+
+ </Tip>
"""
def __post_init__(self):
@@ -2263,7 +2341,7 @@ class ModelOutput(OrderedDict):
def to_tuple(self) -> Tuple[Any]:
"""
- Convert self to a tuple containing all the attributes/keys that are not ``None``.
+ Convert self to a tuple containing all the attributes/keys that are not `None`.
"""
return tuple(self[k] for k in self.keys())
@@ -2282,8 +2360,8 @@ class ExplicitEnum(Enum):
class PaddingStrategy(ExplicitEnum):
"""
- Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
- in an IDE.
+ Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an
+ IDE.
"""
LONGEST = "longest"
@@ -2293,7 +2371,7 @@ class PaddingStrategy(ExplicitEnum):
class TensorType(ExplicitEnum):
"""
- Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
+ Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for
tab-completion in an IDE.
"""
@@ -2413,55 +2491,56 @@ class PushToHubMixin:
) -> str:
"""
Upload the {object_files} to the 🤗 Model Hub while synchronizing a local clone of the repo in
- :obj:`repo_path_or_name`.
+ `repo_path_or_name`.
Parameters:
- repo_path_or_name (:obj:`str`, `optional`):
+ repo_path_or_name (`str`, *optional*):
Can either be a repository name for your {object} in the Hub or a path to a local folder (in which case
the repository will have the name of that local folder). If not specified, will default to the name
- given by :obj:`repo_url` and a local directory with that name will be created.
- repo_url (:obj:`str`, `optional`):
+ given by `repo_url` and a local directory with that name will be created.
+ repo_url (`str`, *optional*):
Specify this in case you want to push to an existing repository in the hub. If unspecified, a new
- repository will be created in your namespace (unless you specify an :obj:`organization`) with
- :obj:`repo_name`.
- use_temp_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to clone the distant repo in a temporary directory or in :obj:`repo_path_or_name` inside
- the current working directory. This will slow things down if you are making changes in an existing repo
+ repository will be created in your namespace (unless you specify an `organization`) with `repo_name`.
+ use_temp_dir (`bool`, *optional*, defaults to `False`):
+ Whether or not to clone the distant repo in a temporary directory or in `repo_path_or_name` inside the
+ current working directory. This will slow things down if you are making changes in an existing repo
since you will need to clone the repo before every push.
- commit_message (:obj:`str`, `optional`):
- Message to commit while pushing. Will default to :obj:`"add {object}"`.
- organization (:obj:`str`, `optional`):
+ commit_message (`str`, *optional*):
+ Message to commit while pushing. Will default to `"add {object}"`.
+ organization (`str`, *optional*):
Organization in which you want to push your {object} (you must be a member of this organization).
- private (:obj:`bool`, `optional`):
+ private (`bool`, *optional*):
Whether or not the repository created should be private (requires a paying subscription).
- use_auth_token (:obj:`bool` or :obj:`str`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Will default to
- :obj:`True` if :obj:`repo_url` is not specified.
+ use_auth_token (`bool` or `str`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `transformers-cli login` (stored in `~/.huggingface`). Will default to `True` if
+ `repo_url` is not specified.
Returns:
- :obj:`str`: The url of the commit of your {object} in the given repository.
+ `str`: The url of the commit of your {object} in the given repository.
- Examples::
+ Examples:
- from transformers import {object_class}
+ ```python
+ from transformers import {object_class}
- {object} = {object_class}.from_pretrained("bert-base-cased")
+ {object} = {object_class}.from_pretrained("bert-base-cased")
- # Push the {object} to your namespace with the name "my-finetuned-bert" and have a local clone in the
- # `my-finetuned-bert` folder.
- {object}.push_to_hub("my-finetuned-bert")
+ # Push the {object} to your namespace with the name "my-finetuned-bert" and have a local clone in the
+ # `my-finetuned-bert` folder.
+ {object}.push_to_hub("my-finetuned-bert")
- # Push the {object} to your namespace with the name "my-finetuned-bert" with no local clone.
- {object}.push_to_hub("my-finetuned-bert", use_temp_dir=True)
+ # Push the {object} to your namespace with the name "my-finetuned-bert" with no local clone.
+ {object}.push_to_hub("my-finetuned-bert", use_temp_dir=True)
- # Push the {object} to an organization with the name "my-finetuned-bert" and have a local clone in the
- # `my-finetuned-bert` folder.
- {object}.push_to_hub("my-finetuned-bert", organization="huggingface")
+ # Push the {object} to an organization with the name "my-finetuned-bert" and have a local clone in the
+ # `my-finetuned-bert` folder.
+ {object}.push_to_hub("my-finetuned-bert", organization="huggingface")
- # Make a change to an existing repo that has been cloned locally in `my-finetuned-bert`.
- {object}.push_to_hub("my-finetuned-bert", repo_url="https://huggingface.co/sgugger/my-finetuned-bert")
+ # Make a change to an existing repo that has been cloned locally in `my-finetuned-bert`.
+ {object}.push_to_hub("my-finetuned-bert", repo_url="https://huggingface.co/sgugger/my-finetuned-bert")
+ ```
"""
if use_temp_dir:
# Make sure we use the right `repo_name` for the `repo_url` before replacing it.
diff --git a/src/transformers/modeling_flax_outputs.py b/src/transformers/modeling_flax_outputs.py
index c748a4f72e..f2a23fa52b 100644
--- a/src/transformers/modeling_flax_outputs.py
+++ b/src/transformers/modeling_flax_outputs.py
@@ -25,16 +25,15 @@ class FlaxBaseModelOutput(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -51,19 +50,18 @@ class FlaxBaseModelOutputWithPast(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- past_key_values (:obj:`Dict[str, jnp.ndarray]`):
+ past_key_values (`Dict[str, jnp.ndarray]`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -81,20 +79,19 @@ class FlaxBaseModelOutputWithPooling(ModelOutput):
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`jnp.ndarray` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -112,34 +109,30 @@ class FlaxBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`tuple(tuple(jnp.ndarray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(jnp.ndarray)` of length :obj:`config.n_layers`, with each tuple having 2 tensors of
- shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
- ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
- encoder_sequence_length, embed_size_per_head)`.
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
- ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -159,45 +152,41 @@ class FlaxSeq2SeqModelOutput(ModelOutput):
decoding.
Args:
- last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`tuple(tuple(jnp.ndarray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(jnp.ndarray)` of length :obj:`config.n_layers`, with each tuple having 2 tensors of
- shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -219,32 +208,30 @@ class FlaxCausalLMOutputWithCrossAttentions(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Cross attentions weights after the attention softmax, used to compute the weighted average in the
cross-attention heads.
- past_key_values (:obj:`tuple(tuple(jnp.ndarray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`jnp.ndarray` tuples of length :obj:`config.n_layers`, with each tuple containing the cached
+ past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `jnp.ndarray` tuples of length `config.n_layers`, with each tuple containing the cached
key, value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
- setting. Only relevant if ``config.is_decoder = True``.
+ setting. Only relevant if `config.is_decoder = True`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
+ `past_key_values` input) to speed up sequential decoding.
"""
logits: jnp.ndarray = None
@@ -260,16 +247,15 @@ class FlaxMaskedLMOutput(ModelOutput):
Base class for masked language models outputs.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -289,42 +275,39 @@ class FlaxSeq2SeqLMOutput(ModelOutput):
Base class for sequence-to-sequence language models outputs.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`tuple(tuple(jnp.ndarray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(jnp.ndarray)` of length :obj:`config.n_layers`, with each tuple having 2 tensors of
- shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -346,17 +329,16 @@ class FlaxNextSentencePredictorOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, 2)`):
+ logits (`jnp.ndarray` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -373,16 +355,15 @@ class FlaxSequenceClassifierOutput(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -399,42 +380,39 @@ class FlaxSeq2SeqSequenceClassifierOutput(ModelOutput):
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`tuple(tuple(jnp.ndarray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(jnp.ndarray)` of length :obj:`config.n_layers`, with each tuple having 2 tensors of
- shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -456,18 +434,17 @@ class FlaxMultipleChoiceModelOutput(ModelOutput):
Base class for outputs of multiple choice models.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, num_choices)`):
- `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+ logits (`jnp.ndarray` of shape `(batch_size, num_choices)`):
+ *num_choices* is the second dimension of the input tensors (see *input_ids* above).
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -484,16 +461,15 @@ class FlaxTokenClassifierOutput(ModelOutput):
Base class for outputs of token classification models.
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
+ logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -510,18 +486,17 @@ class FlaxQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering models.
Args:
- start_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -539,44 +514,41 @@ class FlaxSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of sequence-to-sequence question answering models.
Args:
- start_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- past_key_values (:obj:`tuple(tuple(jnp.ndarray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(jnp.ndarray)` of length :obj:`config.n_layers`, with each tuple having 2 tensors of
- shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index 7cc9229a50..34bed2119b 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -26,16 +26,15 @@ class BaseModelOutput(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -52,21 +51,20 @@ class BaseModelOutputWithPooling(ModelOutput):
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) after further processing
through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
the classification token after processing through a linear layer and a tanh activation function. The linear
layer weights are trained from the next sentence prediction (classification) objective during pretraining.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -84,28 +82,25 @@ class BaseModelOutputWithPast(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
- ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
- encoder_sequence_length, embed_size_per_head)`.
+ If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
- ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -123,22 +118,20 @@ class BaseModelOutputWithCrossAttentions(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -156,39 +149,36 @@ class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) after further processing
through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
the classification token after processing through a linear layer and a tanh activation function. The linear
layer weights are trained from the next sentence prediction (classification) objective during pretraining.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
- ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
- encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
- ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
"""
last_hidden_state: torch.FloatTensor = None
@@ -205,34 +195,30 @@ class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
- ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
- encoder_sequence_length, embed_size_per_head)`.
+ If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
- ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -252,45 +238,41 @@ class Seq2SeqModelOutput(ModelOutput):
decoding.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -312,18 +294,17 @@ class CausalLMOutput(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -341,24 +322,23 @@ class CausalLMOutputWithPast(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -377,34 +357,32 @@ class CausalLMOutputWithCrossAttentions(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Cross attentions weights after the attention softmax, used to compute the weighted average in the
cross-attention heads.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`torch.FloatTensor` tuples of length :obj:`config.n_layers`, with each tuple containing the
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the
cached key, value states of the self-attention and the cross-attention layers if model is used in
- encoder-decoder setting. Only relevant if ``config.is_decoder = True``.
+ encoder-decoder setting. Only relevant if `config.is_decoder = True`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
+ `past_key_values` input) to speed up sequential decoding.
"""
loss: Optional[torch.FloatTensor] = None
@@ -421,24 +399,23 @@ class SequenceClassifierOutputWithPast(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -457,18 +434,17 @@ class MaskedLMOutput(ModelOutput):
Base class for masked language models outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -486,44 +462,41 @@ class Seq2SeqLMOutput(ModelOutput):
Base class for sequence-to-sequence language models outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -546,19 +519,18 @@ class NextSentencePredictorOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided):
Next sequence prediction (classification) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -576,18 +548,17 @@ class SequenceClassifierOutput(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -605,44 +576,41 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput):
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -665,20 +633,19 @@ class MultipleChoiceModelOutput(ModelOutput):
Base class for outputs of multiple choice models.
Args:
- loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
- `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+ logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
+ `num_choices` is the second dimension of the input tensors (see `input_ids` above).
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -696,18 +663,17 @@ class TokenClassifierOutput(ModelOutput):
Base class for outputs of token classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -725,20 +691,19 @@ class QuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -757,46 +722,43 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of sequence-to-sequence question answering models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
diff --git a/src/transformers/modeling_tf_outputs.py b/src/transformers/modeling_tf_outputs.py
index 123ae76db7..581f491243 100644
--- a/src/transformers/modeling_tf_outputs.py
+++ b/src/transformers/modeling_tf_outputs.py
@@ -26,16 +26,15 @@ class TFBaseModelOutput(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(tf.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -52,23 +51,22 @@ class TFBaseModelOutputWithPooling(ModelOutput):
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input, you're often better with
averaging or pooling the sequence of hidden-states for the whole input sequence.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -86,35 +84,32 @@ class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input, you're often better with
averaging or pooling the sequence of hidden-states for the whole input sequence.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -134,25 +129,22 @@ class TFBaseModelOutputWithPast(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -170,22 +162,20 @@ class TFBaseModelOutputWithCrossAttentions(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(tf.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -203,31 +193,27 @@ class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(tf.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -247,44 +233,39 @@ class TFSeq2SeqModelOutput(ModelOutput):
decoding.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -306,18 +287,17 @@ class TFCausalLMOutput(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of non-masked labels, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -335,24 +315,22 @@ class TFCausalLMOutputWithPast(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of non-masked labels, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -371,33 +349,30 @@ class TFCausalLMOutputWithCrossAttentions(ModelOutput):
Base class for causal language model (or autoregressive) outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of non-masked labels, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
+ `past_key_values` input) to speed up sequential decoding.
"""
loss: Optional[tf.Tensor] = None
@@ -414,18 +389,17 @@ class TFMaskedLMOutput(ModelOutput):
Base class for masked language models outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of non-masked labels, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Masked language modeling (MLM) loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -443,43 +417,39 @@ class TFSeq2SeqLMOutput(ModelOutput):
Base class for sequence-to-sequence language models outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of non-masked labels, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -502,19 +472,18 @@ class TFNextSentencePredictorOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of non-masked labels, returned when :obj:`next_sentence_label` is provided):
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided):
Next sentence prediction loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+ logits (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -532,18 +501,17 @@ class TFSequenceClassifierOutput(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -561,37 +529,34 @@ class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -613,20 +578,19 @@ class TFMultipleChoiceModelOutput(ModelOutput):
Base class for outputs of multiple choice models.
Args:
- loss (:obj:`tf.Tensor` of shape `(batch_size, )`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
- `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+ logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
+ `num_choices` is the second dimension of the input tensors (see `input_ids` above).
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -644,18 +608,17 @@ class TFTokenClassifierOutput(ModelOutput):
Base class for outputs of token classification models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(n,)`, `optional`, where n is the number of unmasked labels, returned when ``labels`` is provided) :
+ loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided):
Classification loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -673,20 +636,19 @@ class TFQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`, returned when :obj:`start_positions` and :obj:`end_positions` are provided):
+ loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -705,39 +667,36 @@ class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of sequence-to-sequence question answering models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -760,24 +719,22 @@ class TFSequenceClassifierOutputWithPast(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- ``past_key_values`` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py
index 95fd9fe235..a7771852b4 100755
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -517,25 +517,24 @@ class AlbertPreTrainedModel(PreTrainedModel):
@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.AlbertForPreTraining`.
+ Output type of [`AlbertForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- sop_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -550,69 +549,67 @@ class AlbertForPreTrainingOutput(ModelOutput):
ALBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Args:
- config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
+ config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`AlbertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -787,32 +784,31 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
return_dict=None,
):
r"""
- labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence
- A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A).
+ (see `input_ids` docstring). Indices should be in `[0, 1]`. `0` indicates original order (sequence
+ A, then sequence B), `1` indicates switched order (sequence B, then sequence A).
Returns:
- Example::
+ Example:
- >>> from transformers import AlbertTokenizer, AlbertForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import AlbertTokenizer, AlbertForPreTraining
+ >>> import torch
- >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
- >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+ >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+ >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
- >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
- >>> outputs = model(input_ids)
+ >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
+ >>> outputs = model(input_ids)
- >>> prediction_logits = outputs.prediction_logits
- >>> sop_logits = outputs.sop_logits
-
- """
+ >>> prediction_logits = outputs.prediction_logits
+ >>> sop_logits = outputs.sop_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
@@ -938,10 +934,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1018,10 +1013,9 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
- config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
- If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1125,9 +1119,8 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1217,13 +1210,13 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1316,10 +1309,9 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see
- `input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors (see
+ *input_ids* above).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index 6d13bc7043..7ff4552163 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -56,22 +56,21 @@ _TOKENIZER_FOR_DOC = "AlbertTokenizer"
@flax.struct.dataclass
class FlaxAlbertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.FlaxAlbertForPreTraining`.
+ Output type of [`FlaxAlbertForPreTraining`].
Args:
- prediction_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- sop_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, 2)`):
+ sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -85,70 +84,67 @@ class FlaxAlbertForPreTrainingOutput(ModelOutput):
ALBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- <https://flax.readthedocs.io/en/latest/flax.linen.html#module>`__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
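- - `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__
- - `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__
- - `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__
- - `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__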
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
+ config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
ALBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+ input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`AlbertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index 9859caffa4..56ced8ec4d 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -163,7 +163,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
@@ -648,22 +648,21 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.TFAlbertForPreTraining`.
+ Output type of [`TFAlbertForPreTraining`].
Args:
- prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- sop_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+ sop_logits (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -678,92 +677,92 @@ class TFAlbertForPreTrainingOutput(ModelOutput):
ALBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Args:
- config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
+ config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`AlbertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1019,10 +1018,9 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1124,10 +1122,9 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
- config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
- If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1233,9 +1230,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1335,13 +1331,13 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
@@ -1462,10 +1458,9 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
**kwargs,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 81fa8a354e..16281ce6bf 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -294,13 +294,13 @@ class BartEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -377,19 +377,19 @@ class BartDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size *(decoder_attention_heads,)*.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -517,19 +517,19 @@ class PretrainedBartModel(BartPretrainedModel):
BART_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.BartConfig`):
+ config ([`BartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BART_GENERATION_EXAMPLE = r"""
@@ -566,110 +566,106 @@ BART_GENERATION_EXAMPLE = r"""
BART_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and
- modify to your needs. See diagram 1 in `the paper `__ for more
+ If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_inputs`] and
+ modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class BartEncoder(BartPretrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`BartEncoderLayer`.
+ [`BartEncoderLayer`].
Args:
config: BartConfig
@@ -721,40 +717,40 @@ class BartEncoder(BartPretrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -844,7 +840,7 @@ class BartEncoder(BartPretrainedModel):
class BartDecoder(BartPretrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`BartDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`].
Args:
config: BartConfig
@@ -916,71 +912,68 @@ class BartDecoder(BartPretrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1303,10 +1296,9 @@ class BartForConditionalGeneration(BartPretrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
@@ -1446,9 +1438,8 @@ class BartForSequenceClassification(BartPretrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
@@ -1571,13 +1562,13 @@ class BartForQuestionAnswering(BartPretrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1649,7 +1640,7 @@ class BartForQuestionAnswering(BartPretrainedModel):
class BartDecoderWrapper(BartPretrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -1710,88 +1701,87 @@ class BartForCausalLM(BartPretrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import BartTokenizer, BartForCausalLM
+ ```python
+ >>> from transformers import BartTokenizer, BartForCausalLM
- >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
- >>> model = BartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
+ >>> model = BartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py
index eed4fe0c98..b001b5bf58 100644
--- a/src/transformers/models/bart/modeling_flax_bart.py
+++ b/src/transformers/models/bart/modeling_flax_bart.py
@@ -59,170 +59,165 @@ _TOKENIZER_FOR_DOC = "BartTokenizer"
BART_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
+ config ([`BartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
BART_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BART_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BART_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*, is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -967,15 +962,14 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py
index fc7823e73a..abf5b4bb4e 100644
--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -296,11 +296,11 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`
+ hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`tf.Tensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`
"""
residual = hidden_states
hidden_states, self_attn_weights, _ = self.self_attn(
@@ -372,17 +372,17 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`
- cross_attn_layer_head_mask (:obj:`tf.Tensor`): mask for heads of the cross-attention module.
- `(decoder_attention_heads,)`
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`tf.Tensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ `(decoder_attention_heads,)`
+ cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module of size
+ `(decoder_attention_heads,)`
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
@@ -472,37 +472,39 @@ class TFBartPretrainedModel(TFPreTrainedModel):
BART_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Args:
- config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
+ config ([`BartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
@@ -538,80 +540,80 @@ BART_GENERATION_EXAMPLE = r"""
BART_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -622,7 +624,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
config_class = BartConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFBartEncoderLayer`.
+ [`TFBartEncoderLayer`].
Args:
config: BartConfig
@@ -666,40 +668,39 @@ class TFBartEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
inputs = input_processing(
func=self.call,
@@ -785,7 +786,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
class TFBartDecoder(tf.keras.layers.Layer):
config_class = BartConfig
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFBartDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`].
Args:
config: BartConfig
@@ -834,65 +835,62 @@ class TFBartDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
inputs = input_processing(
func=self.call,
@@ -1348,10 +1346,9 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index 5ff43d4c56..bd8071ba70 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -46,23 +46,22 @@ BEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@dataclass
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
"""
- Class for outputs of :class:`~transformers.BeitModel`.
+ Class for outputs of [`BeitModel`].
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
- Average of the last layer hidden states of the patch tokens (excluding the `[CLS]` token) if
- `config.use_mean_pooling` is set to True. If set to False, then the final hidden state of the `[CLS]` token
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+ Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
+ *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
will be returned.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -548,37 +547,37 @@ class BeitPreTrainedModel(PreTrainedModel):
BEIT_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ subclass. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.BeitConfig`): Model configuration class with all the parameters of the model.
+ config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
BEIT_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using :class:`~transformers.BeitFeatureExtractor`. See
- :meth:`transformers.BeitFeatureExtractor.__call__` for details.
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`BeitFeatureExtractor`]. See
+ [`BeitFeatureExtractor.__call__`] for details.
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -733,32 +732,32 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
return_dict=None,
):
r"""
- bool_masked_pos (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, num_patches)`):
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
- >>> model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
+ >>> model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.beit(
@@ -823,32 +822,32 @@ class BeitForImageClassification(BeitPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import BeitFeatureExtractor, BeitForImageClassification
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, BeitForImageClassification
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
- >>> model = BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
+ >>> model = BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.beit(
@@ -1157,30 +1156,30 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, height, width)`, `optional`):
- Ground truth semantic segmentation maps for computing the loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels > 1`, a classification loss is computed
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed
(Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import BeitFeatureExtractor, BeitForSemanticSegmentation
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, BeitForSemanticSegmentation
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
- >>> model = BeitForSemanticSegmentation.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
+ >>> model = BeitForSemanticSegmentation.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
- >>> logits = outputs.logits
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py
index e276b34fb8..5a1b0c25af 100644
--- a/src/transformers/models/beit/modeling_flax_beit.py
+++ b/src/transformers/models/beit/modeling_flax_beit.py
@@ -44,76 +44,74 @@ from .configuration_beit import BeitConfig
@flax.struct.dataclass
class FlaxBeitModelOutputWithPooling(FlaxBaseModelOutputWithPooling):
"""
- Class for outputs of :class:`~transformers.FlaxBeitModel`.
+ Class for outputs of [`FlaxBeitModel`].
Args:
- last_hidden_state (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`jnp.ndarray` of shape :obj:`(batch_size, hidden_size)`):
- Average of the last layer hidden states of the patch tokens (excluding the `[CLS]` token) if
- `config.use_mean_pooling` is set to True. If set to False, then the final hidden state of the `[CLS]` token
+ pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
+ Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
+ *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
will be returned.
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each
layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
BEIT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- `__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.BeitConfig`): Model configuration class with all the parameters of the model.
+ config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
BEIT_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`numpy.ndarray` of shape :obj:`(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using :class:`~transformers.BeitFeatureExtractor`. See
- :meth:`transformers.BeitFeatureExtractor.__call__` for details.
+ pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`BeitFeatureExtractor`]. See
+ [`BeitFeatureExtractor.__call__`] for details.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -810,26 +808,28 @@ class FlaxBeitForMaskedImageModeling(FlaxBeitPreTrainedModel):
FLAX_BEIT_MLM_DOCSTRING = """
- bool_masked_pos (:obj:`numpy.ndarray` of shape :obj:`(batch_size, num_patches)`):
+ bool_masked_pos (`numpy.ndarray` of shape `(batch_size, num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
Returns:
- Examples::
+ Examples:
- >>> from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, FlaxBeitForMaskedImageModeling
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
- >>> model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
+ >>> model = FlaxBeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
- >>> inputs = feature_extractor(images=image, return_tensors="np")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
+ >>> inputs = feature_extractor(images=image, return_tensors="np")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```
"""
overwrite_call_docstring(FlaxBeitForMaskedImageModeling, FLAX_BEIT_MLM_DOCSTRING)
diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py
index 5af77eec00..01bb9fbc17 100755
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -744,25 +744,25 @@ class BertPreTrainedModel(PreTrainedModel):
@dataclass
class BertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.BertForPreTraining`.
+ Output type of [`BertForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -777,69 +777,67 @@ class BertForPreTrainingOutput(ModelOutput):
BERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
- methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
- pruning heads etc.)
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
- This model is also a PyTorch `torch.nn.Module `__
- subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
- general usage and behavior.
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
Parameters:
- config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
+ config ([`BertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
- weights.
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
- details.
+ Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
- vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -851,14 +849,13 @@ class BertModel(BertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
- input to the forward pass.
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
"""
def __init__(self, config, add_pooling_layer=True):
@@ -911,24 +908,24 @@ class BertModel(BertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1064,34 +1061,36 @@ class BertForPreTraining(BertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
- Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
+ pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
- - 0 indicates sequence B is a continuation of sequence A,
- - 1 indicates sequence B is a random sequence.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
- Used to hide legacy arguments that have been deprecated.
+ - 0 indicates sequence B is a continuation of sequence A,
+ - 1 indicates sequence B is a random sequence.
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
+ Used to hide legacy arguments that have been deprecated.
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, BertForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, BertForPreTraining
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1176,45 +1175,49 @@ class BertLMHeadModel(BertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
- the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ if the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
- Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be
+ in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100`
+ are ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
+ config.vocab_size]`
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+ decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
- >>> config = BertConfig.from_pretrained("bert-base-cased")
- >>> config.is_decoder = True
- >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
+ >>> config.is_decoder = True
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
+ >>> prediction_logits = outputs.logits
+ ```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
@@ -1329,10 +1332,10 @@ class BertForMaskedLM(BertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1418,30 +1421,32 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, BertForNextSentencePrediction
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, BertForNextSentencePrediction
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
- >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
- >>> logits = outputs.logits
- >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```
"""
if "next_sentence_label" in kwargs:
@@ -1530,10 +1535,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1630,10 +1635,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1729,9 +1734,8 @@ class BertForTokenClassification(BertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1821,14 +1825,14 @@ class BertForQuestionAnswering(BertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
- sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
- sequence are not taken into account for computing the loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index 5c66feabb5..6ffcc9d221 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -57,22 +57,21 @@ _TOKENIZER_FOR_DOC = "BertTokenizer"
@flax.struct.dataclass
class FlaxBertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.BertForPreTraining`.
+ Output type of [`BertForPreTraining`].
Args:
- prediction_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -86,89 +85,85 @@ class FlaxBertForPreTrainingOutput(ModelOutput):
BERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- `__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
+ config ([`BertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
BERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+ input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- head_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ head_mask (`numpy.ndarray` of shape `({0})`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index 23e4b8e270..6c3ceaa5bf 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -181,7 +181,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
@@ -913,7 +913,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
dummy = {"input_ids": tf.constant(DUMMY_INPUTS)}
# Add `encoder_hidden_states` to make the cross-attention layers' weights initialized
@@ -929,22 +929,21 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
@dataclass
class TFBertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.TFBertForPreTraining`.
+ Output type of [`TFBertForPreTraining`].
Args:
- prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -959,92 +958,92 @@ class TFBertForPreTrainingOutput(ModelOutput):
BERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using the [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Args:
- config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
+ config ([`BertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
BERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`):
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1086,24 +1085,24 @@ class TFBertModel(TFBertPreTrainedModel):
**kwargs,
) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
"""
inputs = input_processing(
func=self.call,
@@ -1212,33 +1211,32 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
**kwargs,
) -> Union[TFBertForPreTrainingOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- next_sentence_label (``tf.Tensor`` of shape ``(batch_size,)``, `optional`):
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ next_sentence_label (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring) Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import BertTokenizer, TFBertForPreTraining
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import BertTokenizer, TFBertForPreTraining
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
- >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
- >>> outputs = model(input_ids)
- >>> prediction_scores, seq_relationship_scores = outputs[:2]
-
- """
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
+ >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
+ >>> outputs = model(input_ids)
+ >>> prediction_scores, seq_relationship_scores = outputs[:2]
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
@@ -1354,10 +1352,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1474,27 +1471,26 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
**kwargs,
) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1723,10 +1719,9 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1833,10 +1828,9 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
**kwargs,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1992,9 +1986,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -2100,13 +2093,13 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
- start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py
index 653e585e68..b126fae2e4 100755
--- a/src/transformers/models/bert_generation/modeling_bert_generation.py
+++ b/src/transformers/models/bert_generation/modeling_bert_generation.py
@@ -195,61 +195,60 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
BERT_GENERATION_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.BertGenerationConfig`): Model configuration class with all the parameters of the model.
+ config ([`BertGenerationConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
BERT_GENERATION_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertGenerationTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`BertGenerationTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -261,17 +260,17 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
This model should be used when leveraging Bert or Roberta checkpoints for the
- :class:`~transformers.EncoderDecoderModel` class as described in `Leveraging Pre-trained Checkpoints for Sequence
- Generation Tasks `__ by Sascha Rothe, Shashi Narayan, and Aliaksei Severyn.
+ [`EncoderDecoderModel`] class as described in [Leveraging Pre-trained Checkpoints for Sequence
+ Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, and Aliaksei Severyn.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
"""
@@ -322,22 +321,22 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for
- tokens that are NOT MASKED, ``0`` for MASKED tokens.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: `1` for
+ tokens that are NOT MASKED, `0` for MASKED tokens.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -485,46 +484,47 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
- >>> import torch
+ ```python
+ >>> from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
+ >>> import torch
- >>> tokenizer = BertGenerationTokenizer.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder')
- >>> config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
- >>> config.is_decoder = True
- >>> model = BertGenerationDecoder.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder', config=config)
+ >>> tokenizer = BertGenerationTokenizer.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder')
+ >>> config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
+ >>> config.is_decoder = True
+ >>> model = BertGenerationDecoder.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder', config=config)
- >>> inputs = tokenizer("Hello, my dog is cute", return_token_type_ids=False, return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_token_type_ids=False, return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py
index 2119009f04..3699aa3a63 100755
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@@ -1781,90 +1781,87 @@ class BigBirdPreTrainedModel(PreTrainedModel):
BIG_BIRD_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.BigBirdConfig`): Model configuration class with all the parameters of the model.
+ config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
BIG_BIRD_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.BigBirdTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BigBirdTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.BigBirdForPreTraining`.
+ Output type of [`BigBirdForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -1883,22 +1880,21 @@ class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 1)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
pooler output from BigBigModel
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -1920,13 +1916,13 @@ class BigBirdModel(BigBirdPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
"""
@@ -1997,23 +1993,23 @@ class BigBirdModel(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -2289,36 +2285,36 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. If specified, nsp loss will be
- added to masked_lm loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be
- in ``[0, 1]``:
+ added to masked_lm loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be
+ in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
- Example::
+ Example:
- >>> from transformers import BigBirdTokenizer, BigBirdForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import BigBirdTokenizer, BigBirdForPreTraining
+ >>> import torch
- >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
- >>> model = BigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
+ >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
+ >>> model = BigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
- """
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
@@ -2404,10 +2400,9 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2504,45 +2499,46 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import BigBirdTokenizer, BigBirdForCausalLM, BigBirdConfig
- >>> import torch
+ ```python
+ >>> from transformers import BigBirdTokenizer, BigBirdForCausalLM, BigBirdConfig
+ >>> import torch
- >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
- >>> config = BigBirdConfig.from_pretrained("google/bigbird-roberta-base")
- >>> config.is_decoder = True
- >>> model = BigBirdForCausalLM.from_pretrained('google/bigbird-roberta-base', config=config)
+ >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
+ >>> config = BigBirdConfig.from_pretrained("google/bigbird-roberta-base")
+ >>> config.is_decoder = True
+ >>> model = BigBirdForCausalLM.from_pretrained('google/bigbird-roberta-base', config=config)
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
@@ -2670,10 +2666,9 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2768,10 +2763,9 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -2864,9 +2858,8 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2975,13 +2968,13 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py
index c43b1a1285..b1ed49cd36 100644
--- a/src/transformers/models/big_bird/modeling_flax_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -55,22 +55,21 @@ _TOKENIZER_FOR_DOC = "BigBirdTokenizer"
@flax.struct.dataclass
class FlaxBigBirdForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.BigBirdForPreTraining`.
+ Output type of [`BigBirdForPreTraining`].
Args:
- prediction_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -88,20 +87,19 @@ class FlaxBigBirdForQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering models.
Args:
- start_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- pooled_output (:obj:`jnp.ndarray` of shape :obj:`(batch_size, hidden_size)`):
+ pooled_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
pooled_output returned by FlaxBigBirdModel.
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -116,76 +114,72 @@ class FlaxBigBirdForQuestionAnsweringModelOutput(ModelOutput):
BIG_BIRD_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- `__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.BigBirdConfig`): Model configuration class with all the parameters of the model.
+ config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
BIG_BIRD_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+ input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BigBirdTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BigBirdTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- head_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
index 06259aaff4..814d63e103 100755
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -1369,11 +1369,11 @@ class BigBirdPegasusEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -1471,19 +1471,19 @@ class BigBirdPegasusDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -1603,19 +1603,19 @@ class BigBirdPegasusPreTrainedModel(PreTrainedModel):
BIGBIRD_PEGASUS_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings
etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.BigBirdPegasusConfig`):
+ config ([`BigBirdPegasusConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIGBIRD_PEGASUS_GENERATION_EXAMPLE = r"""
@@ -1636,113 +1636,110 @@ BIGBIRD_PEGASUS_GENERATION_EXAMPLE = r"""
BIGBIRD_PEGASUS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Provide for translation and summarization training. By default, the model will create this tensor by
- shifting the :obj:`input_ids` to the right, following the paper.
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ shifting the `input_ids` to the right, following the paper.
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
If you want to change padding behavior, you should read
- :func:`modeling_bigbird_pegasus._prepare_decoder_inputs` and modify to your needs. See diagram 1 in `the
- paper `__ for more information on the default strategy.
+ [`modeling_bigbird_pegasus._prepare_decoder_inputs`] and modify to your needs. See diagram 1 in [the
+ paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BIGBIRD_PEGASUS_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`BigBirdPegasusEncoderLayer`.
+ [`BigBirdPegasusEncoderLayer`].
Args:
config: BigBirdPegasusConfig
@@ -1791,35 +1788,35 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -2040,7 +2037,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
- :class:`BigBirdPegasusDecoderLayer`
+ [`BigBirdPegasusDecoderLayer`].
Args:
config: BigBirdPegasusConfig
@@ -2113,71 +2110,68 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -2502,10 +2496,9 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
@@ -2646,9 +2639,8 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
@@ -2772,13 +2764,13 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2851,7 +2843,7 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -2913,88 +2905,87 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import PegasusTokenizer, BigBirdPegasusForCausalLM
+ ```python
+ >>> from transformers import PegasusTokenizer, BigBirdPegasusForCausalLM
- >>> tokenizer = PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
- >>> model = BigBirdPegasusForCausalLM.from_pretrained("google/bigbird-pegasus-large-arxiv", add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
+ >>> model = BigBirdPegasusForCausalLM.from_pretrained("google/bigbird-pegasus-large-arxiv", add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index 208d997f2b..d92678a76e 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -297,13 +297,13 @@ class BlenderbotEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -381,19 +381,19 @@ class BlenderbotDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size *(decoder_attention_heads,)*.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -489,19 +489,19 @@ class BlenderbotPreTrainedModel(PreTrainedModel):
BLENDERBOT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.BlenderbotConfig`):
+ config ([`BlenderbotConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLENDERBOT_GENERATION_EXAMPLE = r"""
@@ -531,102 +531,98 @@ BLENDERBOT_GENERATION_EXAMPLE = r"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Blenderbot uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*, is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class BlenderbotEncoder(BlenderbotPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`BlenderbotEncoderLayer`.
+ [`BlenderbotEncoderLayer`].
Args:
config: BlenderbotConfig
@@ -672,40 +668,40 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -795,7 +791,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
class BlenderbotDecoder(BlenderbotPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`BlenderbotDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotDecoderLayer`].
Args:
config: BlenderbotConfig
@@ -868,72 +864,68 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1288,10 +1280,9 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
@@ -1386,7 +1377,7 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -1448,88 +1439,87 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotTokenizer, BlenderbotForCausalLM
+ ```python
+ >>> from transformers import BlenderbotTokenizer, BlenderbotForCausalLM
- >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/bart-large')
- >>> model = BlenderbotForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/bart-large')
+ >>> model = BlenderbotForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
index fb5be5faac..068161dbcf 100644
--- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
@@ -57,158 +57,153 @@ _CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
BLENDERBOT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.BlenderbotConfig`): Model configuration class with all the parameters of the model.
+ config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BLENDERBOT_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BLENDERBOT_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -928,15 +923,14 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
index b8e7e6fe7b..24ae231c0c 100644
--- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
@@ -300,11 +300,11 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -377,17 +377,17 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`
- cross_attn_layer_head_mask (:obj:`tf.Tensor`): mask for heads of the cross-attention module.
- `(decoder_attention_heads,)`
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(decoder_attention_heads,)*
+ cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module of size
+ *(decoder_attention_heads,)*
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -478,37 +478,39 @@ class TFBlenderbotPreTrainedModel(TFPreTrainedModel):
BLENDERBOT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ </Tip>
+
Args:
- config (:class:`~transformers.BlenderbotConfig`): Model configuration class with all the parameters of the model.
+ config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
@@ -539,76 +541,76 @@ BLENDERBOT_GENERATION_EXAMPLE = r"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Blenderbot uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -619,7 +621,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
config_class = BlenderbotConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFBlenderbotEncoderLayer`.
+ [`TFBlenderbotEncoderLayer`].
Args:
config: BlenderbotConfig
@@ -663,44 +665,43 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -789,7 +790,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
class TFBlenderbotDecoder(tf.keras.layers.Layer):
config_class = BlenderbotConfig
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFBlenderbotDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`]
Args:
config: BlenderbotConfig
@@ -838,69 +839,66 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1370,10 +1368,9 @@ class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausal
**kwargs,
):
r"""
- labels (:obj:`tf.tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index 541b955914..276ff96a54 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -295,13 +295,13 @@ class BlenderbotSmallEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -379,19 +379,19 @@ class BlenderbotSmallDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size *(decoder_attention_heads,)*.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -487,19 +487,19 @@ class BlenderbotSmallPreTrainedModel(PreTrainedModel):
BLENDERBOT_SMALL_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.BlenderbotSmallConfig`):
+ config ([`BlenderbotSmallConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLENDERBOT_SMALL_GENERATION_EXAMPLE = r"""
@@ -531,102 +531,98 @@ BLENDERBOT_SMALL_GENERATION_EXAMPLE = r"""
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- BlenderbotSmall uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation.
- If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ BlenderbotSmall uses the `bos_token_id` as the starting token for `decoder_input_ids` generation.
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state`, of shape `(batch_size, sequence_length, hidden_size)`,
+ is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`BlenderbotSmallEncoderLayer`.
+ [`BlenderbotSmallEncoderLayer`].
Args:
config: BlenderbotSmallConfig
@@ -672,40 +668,40 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -794,7 +790,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
- :class:`BlenderbotSmallDecoderLayer`
+ [`BlenderbotSmallDecoderLayer`].
Args:
config: BlenderbotSmallConfig
@@ -867,71 +863,68 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1262,10 +1255,9 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
@@ -1360,7 +1352,7 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -1422,88 +1414,87 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForCausalLM
+ ```python
+ >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForCausalLM
- >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/bart-large')
- >>> model = BlenderbotSmallForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/bart-large')
+ >>> model = BlenderbotSmallForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
index aada0cbc1c..1daf801b78 100644
--- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
@@ -57,170 +57,165 @@ _CONFIG_FOR_DOC = "BlenderbotSmallConfig"
_TOKENIZER_FOR_DOC = "BlenderbotSmallTokenizer"
BLENDERBOT_SMALL_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- <https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html>`__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
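- - `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__
- - `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__
- - `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__
- - `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__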
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.BlenderbotSmallConfig`): Model configuration class with all the parameters of the model.
+ config ([`BlenderbotSmallConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- <https://arxiv.org/abs/1910.13461>`__ for more information on the default strategy.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- <https://arxiv.org/abs/1910.13461>`__ for more information on the default strategy.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -940,15 +935,14 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
index c590a14458..dd12fa51d1 100644
--- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
@@ -299,11 +299,11 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*
"""
residual = hidden_states
hidden_states, self_attn_weights, _ = self.self_attn(
@@ -376,17 +376,17 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`
- cross_attn_layer_head_mask (:obj:`tf.Tensor`): mask for heads of the cross-attention module.
- `(decoder_attention_heads,)`
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(decoder_attention_heads,)*
+ cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
+ *(decoder_attention_heads,)*
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
@@ -477,37 +477,39 @@ class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel):
BLENDERBOT_SMALL_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using the [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ </Tip>
Args:
- config (:class:`~transformers.BlenderbotSmallConfig`): Model configuration class with all the parameters of the model.
+ config ([`BlenderbotSmallConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
@@ -543,76 +545,76 @@ BLENDERBOT_SMALL_GENERATION_EXAMPLE = r"""
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- BlenderbotSmall uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation.
- If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ BlenderbotSmall uses the `bos_token_id` as the starting token for `decoder_input_ids` generation.
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -623,7 +625,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
config_class = BlenderbotSmallConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFBlenderbotSmallEncoderLayer`.
+ [`TFBlenderbotSmallEncoderLayer`].
Args:
config: BlenderbotSmallConfig
@@ -667,44 +669,43 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -793,7 +794,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
config_class = BlenderbotSmallConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
- :class:`TFBlenderbotSmallDecoderLayer`
+ [`TFBlenderbotSmallDecoderLayer`]
Args:
config: BlenderbotSmallConfig
@@ -842,69 +843,66 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`BlenderbotSmallTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size,
+ sequence_length)`. inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1344,10 +1342,9 @@ class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel
**kwargs,
):
r"""
- labels (:obj:`tf.tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py
index 46bf8d20bb..872c936861 100644
--- a/src/transformers/models/camembert/modeling_camembert.py
+++ b/src/transformers/models/camembert/modeling_camembert.py
@@ -42,18 +42,18 @@ CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
CAMEMBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
+ config ([`CamembertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
@@ -64,7 +64,7 @@ CAMEMBERT_START_DOCSTRING = r"""
)
class CamembertModel(RobertaModel):
"""
- This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
+ This class overrides [`RobertaModel`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
@@ -77,7 +77,7 @@ class CamembertModel(RobertaModel):
)
class CamembertForMaskedLM(RobertaForMaskedLM):
"""
- This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
+ This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
@@ -93,7 +93,7 @@ class CamembertForMaskedLM(RobertaForMaskedLM):
)
class CamembertForSequenceClassification(RobertaForSequenceClassification):
"""
- This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
+ This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -109,7 +109,7 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
)
class CamembertForMultipleChoice(RobertaForMultipleChoice):
"""
- This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
+ This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -125,7 +125,7 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
)
class CamembertForTokenClassification(RobertaForTokenClassification):
"""
- This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
+ This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -141,7 +141,7 @@ class CamembertForTokenClassification(RobertaForTokenClassification):
)
class CamembertForQuestionAnswering(RobertaForQuestionAnswering):
"""
- This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
+ This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -153,7 +153,7 @@ class CamembertForQuestionAnswering(RobertaForQuestionAnswering):
)
class CamembertForCausalLM(RobertaForCausalLM):
"""
- This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the superclass for the appropriate
+ This class overrides [`RobertaForCausalLM`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py
index f552c9f5c2..bf17be951e 100644
--- a/src/transformers/models/camembert/modeling_tf_camembert.py
+++ b/src/transformers/models/camembert/modeling_tf_camembert.py
@@ -37,37 +37,39 @@ TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
CAMEMBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
+ config ([`CamembertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
@@ -78,7 +80,7 @@ CAMEMBERT_START_DOCSTRING = r"""
)
class TFCamembertModel(TFRobertaModel):
"""
- This class overrides :class:`~transformers.TFRobertaModel`. Please check the superclass for the appropriate
+ This class overrides [`TFRobertaModel`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
@@ -91,7 +93,7 @@ class TFCamembertModel(TFRobertaModel):
)
class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
"""
- This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the superclass for the appropriate
+ This class overrides [`TFRobertaForMaskedLM`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
@@ -107,7 +109,7 @@ class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
)
class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification):
"""
- This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the superclass for the
+ This class overrides [`TFRobertaForSequenceClassification`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -123,7 +125,7 @@ class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification):
)
class TFCamembertForTokenClassification(TFRobertaForTokenClassification):
"""
- This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the superclass for the
+ This class overrides [`TFRobertaForTokenClassification`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -139,7 +141,7 @@ class TFCamembertForTokenClassification(TFRobertaForTokenClassification):
)
class TFCamembertForMultipleChoice(TFRobertaForMultipleChoice):
"""
- This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the superclass for the
+ This class overrides [`TFRobertaForMultipleChoice`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -155,7 +157,7 @@ class TFCamembertForMultipleChoice(TFRobertaForMultipleChoice):
)
class TFCamembertForQuestionAnswering(TFRobertaForQuestionAnswering):
"""
- This class overrides :class:`~transformers.TFRobertaForQuestionAnswering`. Please check the superclass for the
+ This class overrides [`TFRobertaForQuestionAnswering`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py
index 0081bedcd9..7d87a7b855 100644
--- a/src/transformers/models/canine/modeling_canine.py
+++ b/src/transformers/models/canine/modeling_canine.py
@@ -65,30 +65,28 @@ _PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211,
@dataclass
class CanineModelOutputWithPooling(ModelOutput):
"""
- Output type of :class:`~transformers.CanineModel`. Based on
- :class:`~transformers.modeling_outputs.BaseModelOutputWithPooling`, but with slightly different
- :obj:`hidden_states` and :obj:`attentions`, as these also include the hidden states and attentions of the shallow
+ Output type of [`CanineModel`]. Based on
+ [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly different
+ `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
Transformer encoders.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
shallow Transformer encoder).
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
weights are trained from the next sentence prediction (classification) objective during pretraining.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of
- each encoder) of shape :obj:`(batch_size, sequence_length, hidden_size)` and :obj:`(batch_size,
- sequence_length // config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of
+ each encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length // config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial input to each Transformer encoder. The hidden states of the shallow encoders
- have length :obj:`sequence_length`, but the hidden states of the deep encoder have length
- :obj:`sequence_length` // :obj:`config.downsampling_rate`.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape
- :obj:`(batch_size, num_heads, sequence_length, sequence_length)` and :obj:`(batch_size, num_heads,
- sequence_length // config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions
+ have length `sequence_length`, but the hidden states of the deep encoder have length
+ `sequence_length` // `config.downsampling_rate`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape
+ `(batch_size, num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length // config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions
weights after the attention softmax, used to compute the weighted average in the self-attention heads.
"""
@@ -516,17 +514,17 @@ class CanineAttention(nn.Module):
"""
Additional arguments related to local attention:
- - **local** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether to apply local attention.
- - **always_attend_to_first_position** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Should all blocks
+ - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
+ - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks
be able to attend
- to the :obj:`to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all**
- (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Should the `from_tensor`'s first position be able to
- attend to all positions within the `from_tensor`? - **attend_from_chunk_width** (:obj:`int`, `optional`,
- defaults to 128) -- The width of each block-wise chunk in :obj:`from_tensor`. - **attend_from_chunk_stride**
- (:obj:`int`, `optional`, defaults to 128) -- The number of elements to skip when moving to the next block in
- :obj:`from_tensor`. - **attend_to_chunk_width** (:obj:`int`, `optional`, defaults to 128) -- The width of each
- block-wise chunk in `to_tensor`. - **attend_to_chunk_stride** (:obj:`int`, `optional`, defaults to 128) -- The
- number of elements to skip when moving to the next block in :obj:`to_tensor`.
+ to the `to_tensor`'s first position (e.g. a [CLS] position)?
+ - **first_position_attends_to_all** (`bool`, *optional*, defaults to `False`) -- Should the `from_tensor`'s
+ first position be able to attend to all positions within the `from_tensor`?
+ - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in `from_tensor`.
+ - **attend_from_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to skip when
+ moving to the next block in `from_tensor`.
+ - **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in `to_tensor`.
+ - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to skip when moving to the next block in `to_tensor`.
"""
def __init__(
@@ -921,65 +919,63 @@ class CaninePreTrainedModel(PreTrainedModel):
CANINE_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.CanineConfig`): Model configuration class with all the parameters of the model.
+ config ([`CanineConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
CANINE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.CanineTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`CanineTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1298,10 +1294,9 @@ class CanineForSequenceClassification(CaninePreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1395,10 +1390,9 @@ class CanineForMultipleChoice(CaninePreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1488,9 +1482,8 @@ class CanineForTokenClassification(CaninePreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1577,13 +1570,13 @@ class CanineForQuestionAnswering(CaninePreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index aa3f724b93..d61ce2553c 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -76,24 +76,24 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
class CLIPOutput(ModelOutput):
"""
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`return_loss` is :obj:`True`):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
- logits_per_image:(:obj:`torch.FloatTensor` of shape :obj:`(image_batch_size, text_batch_size)`):
- The scaled dot product scores between :obj:`image_embeds` and :obj:`text_embeds`. This represents the
+ logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the
image-text similarity scores.
- logits_per_text:(:obj:`torch.FloatTensor` of shape :obj:`(text_batch_size, image_batch_size)`):
- The scaled dot product scores between :obj:`text_embeds` and :obj:`image_embeds`. This represents the
+ logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the
text-image similarity scores.
- text_embeds(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`):
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
- :class:`~transformers.CLIPTextModel`.
- image_embeds(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`):
+ [`CLIPTextModel`].
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
- :class:`~transformers.CLIPVisionModel`.
- text_model_output(:obj:`BaseModelOutputWithPooling`):
- The output of the :class:`~transformers.CLIPTextModel`.
- vision_model_output(:obj:`BaseModelOutputWithPooling`):
- The output of the :class:`~transformers.CLIPVisionModel`.
+ [`CLIPVisionModel`].
+ text_model_output (`BaseModelOutputWithPooling`):
+ The output of the [`CLIPTextModel`].
+ vision_model_output (`BaseModelOutputWithPooling`):
+ The output of the [`CLIPVisionModel`].
"""
loss: Optional[torch.FloatTensor] = None
@@ -299,12 +299,12 @@ class CLIPEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(batch, seq_len, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- :obj:`(config.encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ `(config.encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -391,110 +391,108 @@ class CLIPPreTrainedModel(PreTrainedModel):
CLIP_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ subclass. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.CLIPConfig`): Model configuration class with all the parameters of the model.
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.CLIPTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`CLIPTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- :class:`~transformers.CLIPFeatureExtractor`. See :meth:`transformers.CLIPFeatureExtractor.__call__` for
+ [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for
details.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
CLIP_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.CLIPTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`CLIPTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- :class:`~transformers.CLIPFeatureExtractor`. See :meth:`transformers.CLIPFeatureExtractor.__call__` for
+ [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for
details.
- return_loss (:obj:`bool`, `optional`):
+ return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class CLIPEncoder(nn.Module):
"""
- Transformer encoder consisting of :obj:`config.num_hidden_layers` self attention layers. Each layer is a
- :class:`~transformers.CLIPEncoderLayer`.
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
Args:
config: CLIPConfig
@@ -517,32 +515,32 @@ class CLIPEncoder(nn.Module):
):
r"""
Args:
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- causal_attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Causal mask for the text model. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -884,19 +882,20 @@ class CLIPModel(CLIPPreTrainedModel):
):
r"""
Returns:
- text_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`): The text embeddings
- obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPTextModel`.
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings
+ obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
- Examples::
+ Examples:
- >>> from transformers import CLIPTokenizer, CLIPModel
+ ```python
+ >>> from transformers import CLIPTokenizer, CLIPModel
- >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
- >>> text_features = model.get_text_features(**inputs)
- """
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
@@ -921,25 +920,26 @@ class CLIPModel(CLIPPreTrainedModel):
):
r"""
Returns:
- image_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`): The image embeddings
- obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPVisionModel`.
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings
+ obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
- Examples::
+ Examples:
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import CLIPProcessor, CLIPModel
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import CLIPProcessor, CLIPModel
- >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(images=image, return_tensors="pt")
+ >>> inputs = processor(images=image, return_tensors="pt")
- >>> image_features = model.get_image_features(**inputs)
- """
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py
index 2a088f0f02..13530e39d3 100644
--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -40,126 +40,123 @@ logger = logging.get_logger(__name__)
CLIP_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- `__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.CLIPConfig`): Model configuration class with all the parameters of the model.
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.CLIPTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`CLIPTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`numpy.ndarray` of shape :obj:`(batch_size, num_channels, height, width)`):
+ pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- :class:`~transformers.CLIPFeatureExtractor`. See :meth:`transformers.CLIPFeatureExtractor.__call__` for
+ [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for
details.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
CLIP_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.CLIPTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`CLIPTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- pixel_values (:obj:`numpy.ndarray` of shape :obj:`(batch_size, num_channels, height, width)`):
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- :class:`~transformers.CLIPFeatureExtractor`. See :meth:`transformers.CLIPFeatureExtractor.__call__` for
+ [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for
details.
- return_loss (:obj:`bool`, `optional`):
+ return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -167,22 +164,22 @@ CLIP_INPUTS_DOCSTRING = r"""
class FlaxCLIPOutput(ModelOutput):
"""
Args:
- logits_per_image:(:obj:`jnp.ndarray` of shape :obj:`(image_batch_size, text_batch_size)`):
- The scaled dot product scores between :obj:`image_embeds` and :obj:`text_embeds`. This represents the
+ logits_per_image (`jnp.ndarray` of shape `(image_batch_size, text_batch_size)`):
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the
image-text similarity scores.
- logits_per_text:(:obj:`jnp.ndarray` of shape :obj:`(text_batch_size, image_batch_size)`):
- The scaled dot product scores between :obj:`text_embeds` and :obj:`image_embeds`. This represents the
+ logits_per_text (`jnp.ndarray` of shape `(text_batch_size, image_batch_size)`):
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the
text-image similarity scores.
- text_embeds(:obj:`jnp.ndarray` of shape :obj:`(batch_size, output_dim`):
+ text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
- :class:`~transformers.FlaxCLIPTextModel`.
- image_embeds(:obj:`jnp.ndarray` of shape :obj:`(batch_size, output_dim`):
+ [`FlaxCLIPTextModel`].
+ image_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
- :class:`~transformers.FlaxCLIPVisionModel`.
- text_model_output(:obj:`FlaxBaseModelOutputWithPooling`):
- The output of the :class:`~transformers.FlaxCLIPTextModel`.
- vision_model_output(:obj:`FlaxBaseModelOutputWithPooling`):
- The output of the :class:`~transformers.FlaxCLIPVisionModel`.
+ [`FlaxCLIPVisionModel`].
+ text_model_output (`FlaxBaseModelOutputWithPooling`):
+ The output of the [`FlaxCLIPTextModel`].
+ vision_model_output (`FlaxBaseModelOutputWithPooling`):
+ The output of the [`FlaxCLIPVisionModel`].
"""
logits_per_image: jnp.ndarray = None
@@ -798,30 +795,31 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.CLIPTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`CLIPTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
Returns:
- text_features (:obj:`jnp.ndarray` of shape :obj:`(batch_size, output_dim`): The text embeddings obtained by
- applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPTextModel`.
+ text_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`FlaxCLIPTextModel`].
- Examples::
+ Examples:
- >>> from transformers import CLIPTokenizer, FlaxCLIPModel
+ ```python
+ >>> from transformers import CLIPTokenizer, FlaxCLIPModel
- >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
- >>> text_features = model.get_text_features(**inputs)
- """
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
@@ -859,31 +857,32 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
):
r"""
Args:
- pixel_values (:obj:`numpy.ndarray` of shape :obj:`(batch_size, num_channels, height, width)`):
+ pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
- using :class:`~transformers.CLIPFeatureExtractor`. See
- :meth:`transformers.CLIPFeatureExtractor.__call__` for details.
+ using [`CLIPFeatureExtractor`]. See
+ [`CLIPFeatureExtractor.__call__`] for details.
Returns:
- image_features (:obj:`jnp.ndarray` of shape :obj:`(batch_size, output_dim`): The image embeddings obtained
- by applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPVisionModel`
+ image_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The image embeddings obtained
+ by applying the projection layer to the pooled output of [`FlaxCLIPVisionModel`].
- Examples::
+ Examples:
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import CLIPProcessor, FlaxCLIPModel
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import CLIPProcessor, FlaxCLIPModel
- >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(images=image, return_tensors="np")
+ >>> inputs = processor(images=image, return_tensors="np")
- >>> image_features = model.get_image_features(**inputs)
- """
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
# Handle any PRNG if needed
diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py
index 7d40d6b8ac..51018e648e 100755
--- a/src/transformers/models/convbert/modeling_convbert.py
+++ b/src/transformers/models/convbert/modeling_convbert.py
@@ -696,68 +696,66 @@ class ConvBertPredictionHeadTransform(nn.Module):
CONVBERT_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.ConvBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
CONVBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.ConvBertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ConvBertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -917,10 +915,9 @@ class ConvBertForMaskedLM(ConvBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1021,10 +1018,9 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1119,10 +1115,9 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1215,9 +1210,8 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1304,13 +1298,13 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py
index 8a376d2f7f..bb5c2b99c1 100644
--- a/src/transformers/models/convbert/modeling_tf_convbert.py
+++ b/src/transformers/models/convbert/modeling_tf_convbert.py
@@ -116,7 +116,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
@@ -649,92 +649,92 @@ class TFConvBertPreTrainedModel(TFPreTrainedModel):
CONVBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Args:
- config (:class:`~transformers.ConvBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
CONVBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.ConvBertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`ConvBertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -905,10 +905,9 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1029,10 +1028,9 @@ class TFConvBertForSequenceClassification(TFConvBertPreTrainedModel, TFSequenceC
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1138,10 +1136,9 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1276,9 +1273,8 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1372,13 +1368,13 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py
index 58e147b4b5..28ac883d42 100644
--- a/src/transformers/models/ctrl/modeling_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_ctrl.py
@@ -239,81 +239,79 @@ class CTRLPreTrainedModel(PreTrainedModel):
CTRL_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
+ config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
CTRL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
- :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
- ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+ `past_key_values[0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
sequence tokens in the vocabulary.
- If :obj:`past_key_values` is used, only input IDs that do not have their past calculated should be passed
- as ``input_ids``.
+ If `past_key_values` is used, only input IDs that do not have their past calculated should be passed
+ as `input_ids`.
- Indices can be obtained using :class:`~transformers.CTRLTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`CTRLTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- past_key_values (:obj:`Tuple[Tuple[torch.FloatTensor]]` of length :obj:`config.n_layers`):
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`Tuple[Tuple[torch.FloatTensor]]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
- :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which
have their past given to this model should not be passed as input ids as they have already been computed.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -539,10 +537,10 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -588,9 +586,9 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
@staticmethod
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
"""
- This function is used to re-order the :obj:`past_key_values` cache if
- :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
- called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ This function is used to re-order the `past_key_values` cache if
+ [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is
+ called. This is required to match `past_key_values` with the correct beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
@@ -601,12 +599,12 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
@add_start_docstrings(
"""
The CTRL Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.CTRLForSequenceClassification` uses the last token in order to do the classification, as
+ [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the
- position of the last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that
- is not a padding token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each
- row of the batch. Since it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of
- :obj:`input_ids`, it does the same (take the last value in each row of the batch).
+ position of the last token. If a `pad_token_id` is defined in the configuration, it finds the last token that
+ is not a padding token in each row. If no `pad_token_id` is defined, it simply takes the last value in each
+ row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of
+ `input_ids`, it does the same (take the last value in each row of the batch).
""",
CTRL_START_DOCSTRING,
)
@@ -643,10 +641,9 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py
index 1884cb7fd7..1d30e7beab 100644
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -428,105 +428,105 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel):
CTRL_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
+ config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
CTRL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]``
- (``sequence_length`` of input past key value states).
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]`
+ (`sequence_length` of input past key value states).
Indices of input sequence tokens in the vocabulary.
- If :obj:`past` is used, only input IDs that do not have their past calculated should be passed as
- ``input_ids``.
+ If `past` is used, only input IDs that do not have their past calculated should be passed as
+ `input_ids`.
- Indices can be obtained using :class:`~transformers.CTRLTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`CTRLTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
+ [What are input IDs?](../glossary#input-ids)
+ past (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
- :obj:`past` output below). Can be used to speed up sequential decoding. The token ids which have their past
+ `past` output below). Can be used to speed up sequential decoding. The token ids which have their past
given to this model should not be passed as input ids as they have already been computed.
- attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, ``past`` key value states are returned and can be used to speed up decoding (see
- ``past``).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past` key value states are returned and can be used to speed up decoding (see
+ `past`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -693,9 +693,8 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -765,13 +764,13 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
"""
The CTRL Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.TFCTRLForSequenceClassification` uses the last token in order to do the classification, as
+ [`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT-1, GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
- :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
- row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
- guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
+ row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
the last value in each row of the batch).
""",
CTRL_START_DOCSTRING,
@@ -816,9 +815,8 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py
index 2d2edd8f7d..07b3bdcf7f 100644
--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -78,23 +78,24 @@ class XSoftmax(torch.autograd.Function):
Masked Softmax which is optimized for saving memory
Args:
- input (:obj:`torch.tensor`): The input tensor that will apply softmax.
- mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ input (`torch.tensor`): The input tensor to which softmax will be applied.
+ mask (`torch.IntTensor`): The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
- Example::
+ Example:
- >>> import torch
- >>> from transformers.models.deberta.modeling_deberta import XSoftmax
+ ```python
+ >>> import torch
+ >>> from transformers.models.deberta.modeling_deberta import XSoftmax
- >>> # Make a tensor
- >>> x = torch.randn([4,20,100])
+ >>> # Make a tensor
+ >>> x = torch.randn([4,20,100])
- >>> # Create a mask
- >>> mask = (x>0).int()
+ >>> # Create a mask
+ >>> mask = (x>0).int()
- >>> y = XSoftmax.apply(x, mask, dim=-1)
- """
+ >>> y = XSoftmax.apply(x, mask, dim=-1)
+ ```"""
@staticmethod
def forward(self, input, mask, dim):
@@ -197,7 +198,7 @@ class StableDropout(nn.Module):
Call the module
Args:
- x (:obj:`torch.tensor`): The input tensor to apply dropout
+ x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context())
@@ -475,16 +476,16 @@ def build_relative_position(query_size, key_size, device):
"""
Build relative position according to the query and key
- We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key
- :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} =
- P_q - P_k`
+ We assume the absolute position of query \\(P_q\\) is in the range (0, query_size) and the absolute position of key
+ \\(P_k\\) is in the range (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} =
+ P_q - P_k\\)
Args:
query_size (int): the length of query
key_size (int): the length of key
Return:
- :obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+ `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
"""
@@ -516,9 +517,9 @@ class DisentangledSelfAttention(nn.Module):
Disentangled self-attention module
Parameters:
- config (:obj:`str`):
+ config (`DebertaConfig`):
A model config class instance with the configuration to build a new model. The schema is similar to
- `BertConfig`, for more details, please refer :class:`~transformers.DebertaConfig`
+ *BertConfig*, for more details, please refer to [`DebertaConfig`]
"""
@@ -575,28 +576,28 @@ class DisentangledSelfAttention(nn.Module):
Call the module
Args:
- hidden_states (:obj:`torch.FloatTensor`):
+ hidden_states (`torch.FloatTensor`):
Input states to the module usually the output from previous layer, it will be the Q,K and V in
- `Attention(Q,K,V)`
+ *Attention(Q,K,V)*
- attention_mask (:obj:`torch.ByteTensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
- sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
+ attention_mask (`torch.ByteTensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+ sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
- output_attentions (:obj:`bool`, optional):
+ output_attentions (`bool`, optional):
Whether return the attention matrix.
- query_states (:obj:`torch.FloatTensor`, optional):
- The `Q` state in `Attention(Q,K,V)`.
+ query_states (`torch.FloatTensor`, optional):
+ The *Q* state in *Attention(Q,K,V)*.
- relative_pos (:obj:`torch.LongTensor`):
- The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with
- values ranging in [`-max_relative_positions`, `max_relative_positions`].
+ relative_pos (`torch.LongTensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
- rel_embeddings (:obj:`torch.FloatTensor`):
- The embedding of relative distances. It's a tensor of shape [:math:`2 \\times
- \\text{max_relative_positions}`, `hidden_size`].
+ rel_embeddings (`torch.FloatTensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
"""
@@ -814,65 +815,62 @@ class DebertaPreTrainedModel(PreTrainedModel):
DEBERTA_START_DOCSTRING = r"""
- The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
- `_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built on top of
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.```
Parameters:
- config (:class:`~transformers.DebertaConfig`): Model configuration class with all the parameters of the model.
+ config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DEBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.DebertaTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DebertaTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1031,10 +1029,9 @@ class DebertaForMaskedLM(DebertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1172,10 +1169,9 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1280,9 +1276,8 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1366,13 +1361,13 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py
index 646280a5fa..146bdac5d7 100644
--- a/src/transformers/models/deberta/modeling_tf_deberta.py
+++ b/src/transformers/models/deberta/modeling_tf_deberta.py
@@ -84,8 +84,8 @@ class TFDebertaXSoftmax(tf.keras.layers.Layer):
Masked Softmax which is optimized for saving memory
Args:
- input (:obj:`tf.Tensor`): The input tensor that will apply softmax.
- mask (:obj:`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ input (`tf.Tensor`): The input tensor to which softmax will be applied.
+ mask (`tf.Tensor`): The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
"""
@@ -394,16 +394,16 @@ def build_relative_position(query_size, key_size):
"""
Build relative position according to the query and key
- We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key
- :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} =
- P_q - P_k`
+ We assume the absolute position of query \\(P_q\\) is in the range (0, query_size) and the absolute position of key
+ \\(P_k\\) is in the range (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} =
+ P_q - P_k\\)
Args:
query_size (int): the length of query
key_size (int): the length of key
Return:
- :obj:`tf.Tensor`: A tensor with shape [1, query_size, key_size]
+ `tf.Tensor`: A tensor with shape [1, query_size, key_size]
"""
q_ids = tf.range(query_size, dtype=tf.int32)
@@ -468,9 +468,9 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
Disentangled self-attention module
Parameters:
- config (:obj:`str`):
+ config (`DebertaConfig`):
A model config class instance with the configuration to build a new model. The schema is similar to
- `BertConfig`, for more details, please refer :class:`~transformers.DebertaConfig`
+ *BertConfig*, for more details, please refer to [`DebertaConfig`]
"""
@@ -561,28 +561,28 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
Call the module
Args:
- hidden_states (:obj:`tf.Tensor`):
+ hidden_states (`tf.Tensor`):
Input states to the module usually the output from previous layer, it will be the Q,K and V in
- `Attention(Q,K,V)`
+ *Attention(Q,K,V)*
- attention_mask (:obj:`tf.Tensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
- sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
+ attention_mask (`tf.Tensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+ sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
- return_att (:obj:`bool`, optional):
+ return_att (`bool`, optional):
Whether return the attention matrix.
- query_states (:obj:`tf.Tensor`, optional):
- The `Q` state in `Attention(Q,K,V)`.
+ query_states (`tf.Tensor`, optional):
+ The *Q* state in *Attention(Q,K,V)*.
- relative_pos (:obj:`tf.Tensor`):
- The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with
- values ranging in [`-max_relative_positions`, `max_relative_positions`].
+ relative_pos (`tf.Tensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
- rel_embeddings (:obj:`tf.Tensor`):
- The embedding of relative distances. It's a tensor of shape [:math:`2 \\times
- \\text{max_relative_positions}`, `hidden_size`].
+ rel_embeddings (`tf.Tensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
"""
@@ -776,7 +776,7 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -1005,85 +1005,82 @@ class TFDebertaPreTrainedModel(TFPreTrainedModel):
DEBERTA_START_DOCSTRING = r"""
- The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
- `_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built on top of
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
-
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ </Tip>
Parameters:
- config (:class:`~transformers.DebertaConfig`): Model configuration class with all the parameters of the model.
+ config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DEBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`)
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.DebertaTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DebertaTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1191,10 +1188,9 @@ class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLos
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
inputs = input_processing(
func=self.call,
@@ -1293,10 +1289,9 @@ class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceCla
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1390,9 +1385,8 @@ class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassific
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1484,13 +1478,13 @@ class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnswerin
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
- start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index e0c78395e6..898d9594f6 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -79,23 +79,24 @@ class XSoftmax(torch.autograd.Function):
Masked Softmax which is optimized for saving memory
Args:
- input (:obj:`torch.tensor`): The input tensor that will apply softmax.
- mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ input (`torch.tensor`): The input tensor to which softmax will be applied.
+ mask (`torch.IntTensor`): The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
- Example::
+ Example:
- >>> import torch
- >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
+ ```python
+ >>> import torch
+ >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
- >>> # Make a tensor
- >>> x = torch.randn([4,20,100])
+ >>> # Make a tensor
+ >>> x = torch.randn([4,20,100])
- >>> # Create a mask
- >>> mask = (x>0).int()
+ >>> # Create a mask
+ >>> mask = (x>0).int()
- >>> y = XSoftmax.apply(x, mask, dim=-1)
- """
+ >>> y = XSoftmax.apply(x, mask, dim=-1)
+ ```"""
@staticmethod
def forward(self, input, mask, dim):
@@ -202,7 +203,7 @@ class StableDropout(nn.Module):
Call the module
Args:
- x (:obj:`torch.tensor`): The input tensor to apply dropout
+ x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context())
@@ -537,9 +538,9 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
"""
Build relative position according to the query and key
- We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key
- :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} =
- P_q - P_k`
+ We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
+ \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+ P_q - P_k\\)
Args:
query_size (int): the length of query
@@ -548,7 +549,7 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
max_position (int): the maximum allowed absolute position
Return:
- :obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+ `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
"""
q_ids = np.arange(0, query_size)
@@ -585,9 +586,9 @@ class DisentangledSelfAttention(nn.Module):
Disentangled self-attention module
Parameters:
- config (:obj:`DebertaV2Config`):
+ config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
- `BertConfig`, for more details, please refer :class:`~transformers.DebertaV2Config`
+ *BertConfig*, for more details, please refer to [`DebertaV2Config`]
"""
@@ -647,28 +648,28 @@ class DisentangledSelfAttention(nn.Module):
Call the module
Args:
- hidden_states (:obj:`torch.FloatTensor`):
+ hidden_states (`torch.FloatTensor`):
Input states to the module usually the output from previous layer, it will be the Q,K and V in
- `Attention(Q,K,V)`
+ *Attention(Q,K,V)*
- attention_mask (:obj:`torch.ByteTensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
- sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
+ attention_mask (`torch.ByteTensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+ sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
- output_attentions (:obj:`bool`, optional):
+ output_attentions (`bool`, optional):
Whether return the attention matrix.
- query_states (:obj:`torch.FloatTensor`, optional):
- The `Q` state in `Attention(Q,K,V)`.
+ query_states (`torch.FloatTensor`, optional):
+ The *Q* state in *Attention(Q,K,V)*.
- relative_pos (:obj:`torch.LongTensor`):
- The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with
- values ranging in [`-max_relative_positions`, `max_relative_positions`].
+ relative_pos (`torch.LongTensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
- rel_embeddings (:obj:`torch.FloatTensor`):
- The embedding of relative distances. It's a tensor of shape [:math:`2 \\times
- \\text{max_relative_positions}`, `hidden_size`].
+ rel_embeddings (`torch.FloatTensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
"""
@@ -921,65 +922,62 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
DEBERTA_START_DOCSTRING = r"""
- The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
- `_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built on top of
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.```
Parameters:
- config (:class:`~transformers.DebertaV2Config`): Model configuration class with all the parameters of the model.
+ config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DEBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.DebertaV2Tokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DebertaV2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1140,10 +1138,9 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1282,10 +1279,9 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1391,9 +1387,8 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1478,13 +1473,13 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
index 63c3e29735..f5e034df33 100644
--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -85,8 +85,8 @@ class TFDebertaV2XSoftmax(tf.keras.layers.Layer):
Masked Softmax which is optimized for saving memory
Args:
- input (:obj:`tf.Tensor`): The input tensor that will apply softmax.
- mask (:obj:`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ input (`tf.Tensor`): The input tensor that will apply softmax.
+ mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
"""
@@ -475,9 +475,9 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
"""
Build relative position according to the query and key
- We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key
- :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} =
- P_q - P_k`
+ We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
+ \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+ P_q - P_k\\)
Args:
query_size (int): the length of query
@@ -486,7 +486,7 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
max_position (int): the maximum allowed absolute position
Return:
- :obj:`tf.Tensor`: A tensor with shape [1, query_size, key_size]
+ `tf.Tensor`: A tensor with shape [1, query_size, key_size]
"""
q_ids = tf.range(query_size, dtype=tf.int32)
@@ -553,9 +553,9 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
Disentangled self-attention module
Parameters:
- config (:obj:`DebertaV2Config`):
+ config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
- `BertConfig`, for more details, please refer :class:`~transformers.DebertaV2Config`
+ *BertConfig*, for more details, please refer to [`DebertaV2Config`]
"""
@@ -642,28 +642,28 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
Call the module
Args:
- hidden_states (:obj:`tf.Tensor`):
+ hidden_states (`tf.Tensor`):
Input states to the module usually the output from previous layer, it will be the Q,K and V in
- `Attention(Q,K,V)`
+ *Attention(Q,K,V)*
- attention_mask (:obj:`tf.Tensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
- sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
+ attention_mask (`tf.Tensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+ sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
- return_att (:obj:`bool`, optional):
+ return_att (`bool`, optional):
Whether return the attention matrix.
- query_states (:obj:`tf.Tensor`, optional):
- The `Q` state in `Attention(Q,K,V)`.
+ query_states (`tf.Tensor`, optional):
+ The *Q* state in *Attention(Q,K,V)*.
- relative_pos (:obj:`tf.Tensor`):
- The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with
- values ranging in [`-max_relative_positions`, `max_relative_positions`].
+ relative_pos (`tf.Tensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
- rel_embeddings (:obj:`tf.Tensor`):
- The embedding of relative distances. It's a tensor of shape [:math:`2 \\times
- \\text{max_relative_positions}`, `hidden_size`].
+ rel_embeddings (`tf.Tensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
"""
@@ -893,7 +893,7 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -1126,85 +1126,82 @@ class TFDebertaV2PreTrainedModel(TFPreTrainedModel):
DEBERTA_START_DOCSTRING = r"""
- The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
- `_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+ The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built on top of
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
-
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
+ - a single Tensor with `input_ids` only and nothing else: `model(inputs_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ </Tip>
Parameters:
- config (:class:`~transformers.DebertaV2Config`): Model configuration class with all the parameters of the model.
+ config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DEBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`)
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`)
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.DebertaV2Tokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DebertaV2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1314,10 +1311,9 @@ class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelin
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1417,10 +1413,9 @@ class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenc
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1515,9 +1510,8 @@ class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClass
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1610,13 +1604,13 @@ class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsw
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
- start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index 6698b5f77f..dbcf11c37a 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -410,37 +410,37 @@ class DeiTPreTrainedModel(PreTrainedModel):
DEIT_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ subclass. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.DeiTConfig`): Model configuration class with all the parameters of the model.
+ config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DEIT_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using :class:`~transformers.DeiTFeatureExtractor`. See
- :meth:`transformers.DeiTFeatureExtractor.__call__` for details.
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`DeiTFeatureExtractor`]. See
+ [`DeiTFeatureExtractor.__call__`] for details.
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -591,34 +591,34 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
- >>> # so the head will be randomly initialized, hence the predictions will be random
- >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
- >>> model = DeiTForImageClassification.from_pretrained('facebook/deit-base-distilled-patch16-224')
+ >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
+ >>> # so the head will be randomly initialized, hence the predictions will be random
+ >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
+ >>> model = DeiTForImageClassification.from_pretrained('facebook/deit-base-distilled-patch16-224')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deit(
@@ -659,24 +659,23 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
@dataclass
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
"""
- Output type of :class:`~transformers.DeiTForImageClassificationWithTeacher`.
+ Output type of [`DeiTForImageClassificationWithTeacher`].
Args:
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores as the average of the cls_logits and distillation logits.
- cls_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
class token).
- distillation_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
distillation token).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index fbaa46e17f..0895290aa7 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -68,21 +68,19 @@ class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
- intermediate_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(config.decoder_layers, batch_size, num_queries, hidden_size)`, `optional`, returned when ``config.auxiliary_loss=True``):
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
"""
@@ -98,31 +96,28 @@ class DetrModelOutput(Seq2SeqModelOutput):
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
compute the weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
compute the weighted average in the self-attention heads.
- intermediate_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(config.decoder_layers, batch_size, sequence_length, hidden_size)`, `optional`, returned when ``config.auxiliary_loss=True``):
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
"""
@@ -133,49 +128,46 @@ class DetrModelOutput(Seq2SeqModelOutput):
@dataclass
class DetrObjectDetectionOutput(ModelOutput):
"""
- Output type of :class:`~transformers.DetrForObjectDetection`.
+ Output type of [`DetrForObjectDetection`].
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided)):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
- loss_dict (:obj:`Dict`, `optional`):
+ loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
- pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`):
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
- possible padding). You can use :meth:`~transformers.DetrFeatureExtractor.post_process` to retrieve the
+ possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the
unnormalized bounding boxes.
- auxiliary_outputs (:obj:`list[Dict]`, `optional`):
- Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to
- `True`) and labels are provided. It is a list of dictionaries containing the two above keys (:obj:`logits`
- and :obj:`pred_boxes`) for each decoder layer.
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to
+ `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits`
+ and `pred_boxes`) for each decoder layer.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
compute the weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
compute the weighted average in the self-attention heads.
"""
@@ -196,54 +188,51 @@ class DetrObjectDetectionOutput(ModelOutput):
@dataclass
class DetrSegmentationOutput(ModelOutput):
"""
- Output type of :class:`~transformers.DetrForSegmentation`.
+ Output type of [`DetrForSegmentation`].
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided)):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
- loss_dict (:obj:`Dict`, `optional`):
+ loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
- pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`):
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
- possible padding). You can use :meth:`~transformers.DetrFeatureExtractor.post_process` to retrieve the
+ possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the
unnormalized bounding boxes.
- pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, height/4, width/4)`):
+ pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
Segmentation masks logits for all queries. See also
- :meth:`~transformers.DetrFeatureExtractor.post_process_segmentation` or
- :meth:`~transformers.DetrFeatureExtractor.post_process_panoptic` to evaluate instance and panoptic
+ [`~DetrFeatureExtractor.post_process_segmentation`] or
+ [`~DetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic
segmentation masks respectively.
- auxiliary_outputs (:obj:`list[Dict]`, `optional`):
- Optional, only returned when auxiliary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to
- `True`) and labels are provided. It is a list of dictionaries containing the two above keys (:obj:`logits`
- and :obj:`pred_boxes`) for each decoder layer.
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to
+ `True`) and labels are provided. It is a list of dictionaries containing the two above keys (`logits`
+ and `pred_boxes`) for each decoder layer.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
compute the weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
compute the weighted average in the self-attention heads.
"""
@@ -618,12 +607,12 @@ class DetrEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings, to be added to hidden_states.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ position_embeddings (`torch.FloatTensor`, *optional*): position embeddings, to be added to hidden_states.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -700,18 +689,18 @@ class DetrDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings that are added to the queries and keys
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ position_embeddings (`torch.FloatTensor`, *optional*): position embeddings that are added to the queries and keys
in the cross-attention layer.
- query_position_embeddings (:obj:`torch.FloatTensor`, `optional`): position embeddings that are added to the queries and keys
+ query_position_embeddings (`torch.FloatTensor`, *optional*): position embeddings that are added to the queries and keys
in the self-attention layer.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -815,65 +804,65 @@ class DetrPreTrainedModel(PreTrainedModel):
DETR_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.DetrConfig`):
+ config ([`DetrConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DETR_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it.
- Pixel values can be obtained using :class:`~transformers.DetrFeatureExtractor`. See
- :meth:`transformers.DetrFeatureExtractor.__call__` for details.
+ Pixel values can be obtained using [`DetrFeatureExtractor`]. See
+ [`DetrFeatureExtractor.__call__`] for details.
- pixel_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, height, width)`, `optional`):
- Mask to avoid performing attention on padding pixel values. Mask values selected in ``[0, 1]``:
+ pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_queries)`, `optional`):
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*):
Not used by default. Can be used to mask object queries.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`
+ is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, hidden_size)`, `optional`):
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class DetrEncoder(DetrPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`DetrEncoderLayer`.
+ [`DetrEncoderLayer`].
The encoder updates the flattened feature map through multiple self-attention layers.
@@ -909,28 +898,28 @@ class DetrEncoder(DetrPreTrainedModel):
):
r"""
Args:
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding pixel features. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
- 1 for pixel features that are real (i.e. **not masked**),
- 0 for pixel features that are padding (i.e. **masked**).
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Position embeddings that are added to the queries and keys in each self-attention layer.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -981,7 +970,7 @@ class DetrEncoder(DetrPreTrainedModel):
class DetrDecoder(DetrPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DetrDecoderLayer`.
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].
The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
@@ -1021,38 +1010,38 @@ class DetrDecoder(DetrPreTrainedModel):
):
r"""
Args:
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The query embeddings that are passed into the decoder.
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on certain queries. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
- 1 for queries that are **not masked**,
- 0 for queries that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
- in ``[0, 1]``:
+ in `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
- position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Position embeddings that are added to the queries and keys in each cross-attention layer.
- query_position_embeddings (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, hidden_size)`):, `optional`):
+ query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Position embeddings that are added to the queries and keys in each self-attention layer.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1364,33 +1353,32 @@ class DetrForObjectDetection(DetrPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`):
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
- respectively). The class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number of
- bounding boxes in the image,)` and the boxes a :obj:`torch.FloatTensor` of shape :obj:`(number of bounding
- boxes in the image, 4)`.
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import DetrFeatureExtractor, DetrForObjectDetection
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import DetrFeatureExtractor, DetrForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
- >>> model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
+ >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
+ >>> model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> # model predicts bounding boxes and corresponding COCO classes
- >>> logits = outputs.logits
- >>> bboxes = outputs.pred_boxes
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> # model predicts bounding boxes and corresponding COCO classes
+ >>> logits = outputs.logits
+ >>> bboxes = outputs.pred_boxes
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# First, sent images through DETR base model to obtain encoder + decoder outputs
@@ -1518,35 +1506,36 @@ class DetrForSegmentation(DetrPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`List[Dict]` of len :obj:`(batch_size,)`, `optional`):
+ labels (`List[Dict]` of len `(batch_size,)`, *optional*):
Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
- should be a :obj:`torch.LongTensor` of len :obj:`(number of bounding boxes in the image,)`, the boxes a
- :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, 4)` and the masks a
- :obj:`torch.FloatTensor` of shape :obj:`(number of bounding boxes in the image, height, width)`.
+ should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
+ `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
+ `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import DetrFeatureExtractor, DetrForSegmentation
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import DetrFeatureExtractor, DetrForSegmentation
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50-panoptic')
- >>> model = DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic')
+ >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50-panoptic')
+ >>> model = DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> # model predicts COCO classes, bounding boxes, and masks
- >>> logits = outputs.logits
- >>> bboxes = outputs.pred_boxes
- >>> masks = outputs.pred_masks
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> # model predicts COCO classes, bounding boxes, and masks
+ >>> logits = outputs.logits
+ >>> bboxes = outputs.pred_boxes
+ >>> masks = outputs.pred_masks
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2162,7 +2151,7 @@ def box_area(boxes: Tensor) -> Tensor:
Args:
boxes (Tensor[N, 4]): boxes for which the area will be computed. They
- are expected to be in (x1, y1, x2, y2) format with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+ are expected to be in (x1, y1, x2, y2) format with `0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
area (Tensor[N]): area for each box
diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index e9b57adc81..db57dd061b 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -377,56 +377,56 @@ class DistilBertPreTrainedModel(PreTrainedModel):
DISTILBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DistilBertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -452,11 +452,10 @@ class DistilBertModel(DistilBertPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
@@ -585,11 +584,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
@@ -623,10 +621,9 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -690,11 +687,10 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
@@ -722,10 +718,9 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -807,11 +802,10 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
@@ -840,13 +834,13 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -925,11 +919,10 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
@@ -957,9 +950,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1031,11 +1023,10 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`)
+ new_num_position_embeddings (`int`):
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
@@ -1060,33 +1051,33 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
Returns:
- Examples::
+ Examples:
- >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
- >>> import torch
+ ```python
+ >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
+ >>> import torch
- >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
- >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
+ >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+ >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> choice0 = "It is eaten with a fork and a knife."
- >>> choice1 = "It is eaten while held in the hand."
- >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> choice0 = "It is eaten with a fork and a knife."
+ >>> choice1 = "It is eaten while held in the hand."
+ >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
- >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
- >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+ >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
+ >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
- >>> # the linear classifier still needs to be trained
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> # the linear classifier still needs to be trained
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py
index db79c2e06f..b61453f918 100644
--- a/src/transformers/models/distilbert/modeling_flax_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py
@@ -47,53 +47,52 @@ _TOKENIZER_FOR_DOC = "DistilBertTokenizer"
FLAX_DISTILBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- `__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+ input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py
index d2449f2a3e..ec888623ef 100644
--- a/src/transformers/models/distilbert/modeling_tf_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py
@@ -102,7 +102,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -451,79 +451,81 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel):
DISTILBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids})`
+
+
Parameters:
- config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`DistilBertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -674,10 +676,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -775,10 +776,9 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
- config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
- If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -870,9 +870,8 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -978,10 +977,9 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1105,13 +1103,13 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py
index 6cde47678c..c845c31aa9 100644
--- a/src/transformers/models/dpr/modeling_dpr.py
+++ b/src/transformers/models/dpr/modeling_dpr.py
@@ -61,21 +61,20 @@ DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
@dataclass
class DPRContextEncoderOutput(ModelOutput):
"""
- Class for outputs of :class:`~transformers.DPRQuestionEncoder`.
+ Class for outputs of [`DPRContextEncoder`].
Args:
- pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``):
- The DPR encoder outputs the `pooler_output` that corresponds to the context representation. Last layer
+ pooler_output: (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+ The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -89,21 +88,20 @@ class DPRContextEncoderOutput(ModelOutput):
@dataclass
class DPRQuestionEncoderOutput(ModelOutput):
"""
- Class for outputs of :class:`~transformers.DPRQuestionEncoder`.
+ Class for outputs of [`DPRQuestionEncoder`].
Args:
- pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``):
- The DPR encoder outputs the `pooler_output` that corresponds to the question representation. Last layer
+ pooler_output: (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+ The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -117,24 +115,23 @@ class DPRQuestionEncoderOutput(ModelOutput):
@dataclass
class DPRReaderOutput(ModelOutput):
"""
- Class for outputs of :class:`~transformers.DPRQuestionEncoder`.
+ Class for outputs of [`DPRReader`].
Args:
- start_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``):
+ start_logits: (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
Logits of the start index of the span for each passage.
- end_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``):
+ end_logits: (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
Logits of the end index of the span for each passage.
- relevance_logits: (:obj:`torch.FloatTensor`` of shape ``(n_passages, )``):
+ relevance_logits: (`torch.FloatTensor` of shape `(n_passages, )`):
Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
question, compared to all the other passages.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -330,113 +327,112 @@ class DPRPretrainedReader(DPRPreTrainedModel):
DPR_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.DPRConfig`): Model configuration class with all the parameters of the model.
+ config ([`DPRConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
DPR_ENCODERS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs (for a pair title+text for example):
- ::
-
- tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
- token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ ```
+ tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ ```
(b) For single sequences (for a question for example):
- ::
-
- tokens: [CLS] the dog is hairy . [SEP]
- token_type_ids: 0 0 0 0 0 0 0
+ ```
+ tokens: [CLS] the dog is hairy . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0
+ ```
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
- Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DPRTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
DPR_READER_INPUTS_DOCSTRING = r"""
Args:
- input_ids: (:obj:`Tuple[torch.LongTensor]` of shapes :obj:`(n_passages, sequence_length)`):
+ input_ids: (`Tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
- and 2) the passages titles and 3) the passages texts To match pretraining, DPR :obj:`input_ids` sequence
+ and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence
should be formatted with [CLS] and [SEP] with the format:
- ``[CLS] [SEP] [SEP] ``
+ `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
- Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for
+ Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for
more details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(n_passages, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are attention masks?](../glossary#attention-mask)
+ inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py
index b060fbb286..4ec0e7b5fb 100644
--- a/src/transformers/models/dpr/modeling_tf_dpr.py
+++ b/src/transformers/models/dpr/modeling_tf_dpr.py
@@ -58,21 +58,20 @@ TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
@dataclass
class TFDPRContextEncoderOutput(ModelOutput):
r"""
- Class for outputs of :class:`~transformers.TFDPRContextEncoder`.
+ Class for outputs of [`TFDPRContextEncoder`].
Args:
- pooler_output: (:obj:``tf.Tensor`` of shape ``(batch_size, embeddings_size)``):
- The DPR encoder outputs the `pooler_output` that corresponds to the context representation. Last layer
+ pooler_output: (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+ The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -86,21 +85,20 @@ class TFDPRContextEncoderOutput(ModelOutput):
@dataclass
class TFDPRQuestionEncoderOutput(ModelOutput):
"""
- Class for outputs of :class:`~transformers.TFDPRQuestionEncoder`.
+ Class for outputs of [`TFDPRQuestionEncoder`].
Args:
- pooler_output: (:obj:``tf.Tensor`` of shape ``(batch_size, embeddings_size)``):
- The DPR encoder outputs the `pooler_output` that corresponds to the question representation. Last layer
+ pooler_output: (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+ The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -114,24 +112,23 @@ class TFDPRQuestionEncoderOutput(ModelOutput):
@dataclass
class TFDPRReaderOutput(ModelOutput):
"""
- Class for outputs of :class:`~transformers.TFDPRReaderEncoder`.
+ Class for outputs of [`TFDPRReader`].
Args:
- start_logits: (:obj:``tf.Tensor`` of shape ``(n_passages, sequence_length)``):
+ start_logits: (`tf.Tensor` of shape `(n_passages, sequence_length)`):
Logits of the start index of the span for each passage.
- end_logits: (:obj:``tf.Tensor`` of shape ``(n_passages, sequence_length)``):
+ end_logits: (`tf.Tensor` of shape `(n_passages, sequence_length)`):
Logits of the end index of the span for each passage.
- relevance_logits: (:obj:`tf.Tensor`` of shape ``(n_passages, )``):
+ relevance_logits: (`tf.Tensor` of shape `(n_passages, )`):
Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
question, compared to all the other passages.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -444,136 +441,137 @@ class TFDPRPretrainedReader(TFPreTrainedModel):
TF_DPR_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Tensorflow `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__
+ This model is also a Tensorflow [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)
subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to
general usage and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ </Tip>
+
Parameters:
- config (:class:`~transformers.DPRConfig`): Model configuration class with all the parameters of the model.
+ config ([`DPRConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs (for a pair title+text for example):
- ::
-
- tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
- token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ ```
+ tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ ```
(b) For single sequences (for a question for example):
- ::
-
- tokens: [CLS] the dog is hairy . [SEP]
- token_type_ids: 0 0 0 0 0 0 0
+ ```
+ tokens: [CLS] the dog is hairy . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0
+ ```
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
- Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`DPRTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
TF_DPR_READER_INPUTS_DOCSTRING = r"""
Args:
- input_ids: (:obj:`Numpy array` or :obj:`tf.Tensor` of shapes :obj:`(n_passages, sequence_length)`):
+ input_ids: (`Numpy array` or `tf.Tensor` of shapes `(n_passages, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
- and 2) the passages titles and 3) the passages texts To match pretraining, DPR :obj:`input_ids` sequence
+ and 2) the passages titles and 3) the passages texts. To match pretraining, DPR `input_ids` sequence
should be formatted with [CLS] and [SEP] with the format:
- ``[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>``
+ `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
- Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for
+ Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for
more details.
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(n_passages, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(n_passages, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are attention masks?](../glossary#attention-mask)
+ inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index 71782da69b..0c4d0e626f 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -695,21 +695,20 @@ class ElectraPreTrainedModel(PreTrainedModel):
@dataclass
class ElectraForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.ElectraForPreTraining`.
+ Output type of [`ElectraForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss of the ELECTRA objective.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -723,79 +722,77 @@ class ElectraForPreTrainingOutput(ModelOutput):
ELECTRA_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
+ config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ELECTRA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.ElectraTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ElectraTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -964,10 +961,9 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1054,26 +1050,27 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
return_dict=None,
):
r"""
- labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids`
- docstring) Indices should be in ``[0, 1]``:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids`
+ docstring) Indices should be in `[0, 1]`:
- 0 indicates the token is an original token,
- 1 indicates the token was replaced.
Returns:
- Examples::
+ Examples:
- >>> from transformers import ElectraTokenizer, ElectraForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import ElectraTokenizer, ElectraForPreTraining
+ >>> import torch
- >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
- >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+ >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+ >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
- >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
- >>> logits = model(input_ids).logits
- """
+ >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
+ >>> logits = model(input_ids).logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.electra(
@@ -1161,10 +1158,9 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1244,9 +1240,8 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1335,13 +1330,13 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1436,10 +1431,9 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py
index 509e2e7f37..020a18eba6 100644
--- a/src/transformers/models/electra/modeling_flax_electra.py
+++ b/src/transformers/models/electra/modeling_flax_electra.py
@@ -56,19 +56,18 @@ _TOKENIZER_FOR_DOC = "ElectraTokenizer"
@flax.struct.dataclass
class FlaxElectraForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.ElectraForPreTraining`.
+ Output type of [`ElectraForPreTraining`].
Args:
- logits (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(jnp.ndarray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`jnp.ndarray` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -81,64 +80,60 @@ class FlaxElectraForPreTrainingOutput(ModelOutput):
ELECTRA_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
+ config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ELECTRA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+ input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.ElectraTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ElectraTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- head_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ head_mask (`numpy.ndarray` of shape `({0})`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -924,18 +919,18 @@ class FlaxElectraSequenceSummary(nn.Module):
Compute a single vector summary of a sequence hidden states.
Args:
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
- - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
- :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
- - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
- output, another string or :obj:`None` will add no activation.
- - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
+ - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+ - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to
+ `config.num_labels` classes (otherwise to `config.hidden_size`).
+ - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the
+ output, another string or `None` will add no activation.
+ - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and
activation.
- - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and
+ - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and
activation.
"""
config: ElectraConfig
@@ -970,14 +965,14 @@ class FlaxElectraSequenceSummary(nn.Module):
Compute a single vector summary of a sequence hidden states.
Args:
- hidden_states (:obj:`jnp.array` of shape :obj:`[batch_size, seq_len, hidden_size]`):
+ hidden_states (`jnp.array` of shape `[batch_size, seq_len, hidden_size]`):
The hidden states of the last layer.
- cls_index (:obj:`jnp.array` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`):
- Used if :obj:`summary_type == "cls_index"` and takes the last token of the sequence as classification
+ cls_index (`jnp.array` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+ Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification
token.
Returns:
- :obj:`jnp.array`: The summary of the sequence hidden states.
+ `jnp.array`: The summary of the sequence hidden states.
"""
# NOTE: this doest "first" type summary always
output = hidden_states[:, 0]
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index 64053a9110..10f8ac6cfc 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -522,7 +522,7 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
@@ -599,7 +599,7 @@ class TFElectraPreTrainedModel(TFPreTrainedModel):
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
dummy = {"input_ids": tf.constant(DUMMY_INPUTS)}
# Add `encoder_hidden_states` to make the cross-attention layers' weights initialized
@@ -822,21 +822,20 @@ class TFElectraMainLayer(tf.keras.layers.Layer):
@dataclass
class TFElectraForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.TFElectraForPreTraining`.
+ Output type of [`TFElectraForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``tf.Tensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
Total loss of the ELECTRA objective.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -849,84 +848,85 @@ class TFElectraForPreTrainingOutput(ModelOutput):
ELECTRA_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
+ config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ELECTRA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.ElectraTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`ElectraTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -972,24 +972,24 @@ class TFElectraModel(TFElectraPreTrainedModel):
**kwargs,
):
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
"""
inputs = input_processing(
func=self.call,
@@ -1230,10 +1230,9 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1355,10 +1354,9 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1463,10 +1461,9 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1603,9 +1600,8 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1701,13 +1697,13 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index d530884eee..27d69e9ef9 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -45,107 +45,104 @@ DEPRECATION_WARNING = (
ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
- :meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
- :meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
+ [`~AutoModel.from_pretrained`] function and the decoder is loaded via
+ [`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added
to the decoder and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
- tasks was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
- `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+ tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
(see the examples for more information).
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.EncoderDecoderConfig`): Model configuration class with all the parameters of the model.
+ config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For training, :obj:`decoder_input_ids` are automatically created by the model by shifting the :obj:`labels`
- to the right, replacing -100 by the :obj:`pad_token_id` and prepending them with the
- :obj:`decoder_start_token_id`.
- decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ For training, `decoder_input_ids` are automatically created by the model by shifting the `labels`
+ to the right, replacing -100 by the `pad_token_id` and prepending them with the
+ `decoder_start_token_id`.
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- encoder_outputs (:obj:`tuple(torch.FloatTensor)`, `optional`):
- This tuple must consist of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
- sequence_length, hidden_size)`) is a tensor of hidden-states at the output of the last layer of the
+ encoder_outputs (`tuple(torch.FloatTensor)`, *optional*):
+ This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) is a tensor of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`decoder_input_ids`
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `decoder_input_ids`
indices into associated vectors than the model's internal embedding lookup matrix.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0,
- ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a :class:`~transformers.file_utils.Seq2SeqLMOutput` instead of a
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~file_utils.Seq2SeqLMOutput`] instead of a
plain tuple.
- kwargs: (`optional`) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
+ kwargs: (*optional*) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
- - Without a prefix which will be input as ``**encoder_kwargs`` for the encoder forward function.
- - With a `decoder_` prefix which will be input as ``**decoder_kwargs`` for the decoder forward function.
+ - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
+ - With a `decoder_` prefix which will be input as `**decoder_kwargs` for the decoder forward function.
"""
@@ -170,10 +167,10 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
class EncoderDecoderModel(PreTrainedModel):
r"""
- :class:`~transformers.EncoderDecoderModel` is a generic model class that will be instantiated as a transformer
+ [`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer
architecture with one of the base model classes of the library as encoder and another one as decoder when created
- with the :meth`~transformers.AutoModel.from_pretrained` class method for the encoder and
- :meth`~transformers.AutoModelForCausalLM.from_pretrained` class method for the decoder.
+ with the [`~AutoModel.from_pretrained`] class method for the encoder and
+ [`~AutoModelForCausalLM.from_pretrained`] class method for the decoder.
"""
config_class = EncoderDecoderConfig
base_model_prefix = "encoder_decoder"
@@ -294,60 +291,60 @@ class EncoderDecoderModel(PreTrainedModel):
checkpoints.
- The model is set in evaluation mode by default using :obj:`model.eval()` (Dropout modules are deactivated). To
- train the model, you need to first set it back in training mode with :obj:`model.train()`.
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
+ train the model, you need to first set it back in training mode with `model.train()`.
Params:
- encoder_pretrained_model_name_or_path (:obj: `str`, `optional`):
+ encoder_pretrained_model_name_or_path (`str`, *optional*):
Information necessary to initiate the encoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
+ decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
Information necessary to initiate the decoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- model_args (remaining positional arguments, `optional`):
- All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`).
+ `output_attentions=True`).
- - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
- - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
+ - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
+ - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
- To update the parent model configuration, do not use a prefix for each configuration parameter.
- Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
- Example::
+ Example:
- >>> from transformers import EncoderDecoderModel
- >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
- >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
- >>> # saving model after fine-tuning
- >>> model.save_pretrained("./bert2bert")
- >>> # load fine-tuned model
- >>> model = EncoderDecoderModel.from_pretrained("./bert2bert")
-
- """
+ ```python
+ >>> from transformers import EncoderDecoderModel
+ >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./bert2bert")
+ >>> # load fine-tuned model
+ >>> model = EncoderDecoderModel.from_pretrained("./bert2bert")
+ ```"""
kwargs_encoder = {
argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 5846610317..3cfb2eb334 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -41,174 +41,170 @@ _CONFIG_FOR_DOC = "EncoderDecoderConfig"
ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
- :meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
- :meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
+ [`~AutoModel.from_pretrained`] function and the decoder is loaded via
+ [`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added
to the decoder and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
- tasks was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
- `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+ tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
(see the examples for more information).
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Parameters:
- config (:class:`~transformers.EncoderDecoderConfig`): Model configuration class with all the parameters of the model.
+ config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For sequence to sequence training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For sequence to sequence training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training.
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.encoder.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.encoder.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.decoder.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.decoder.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a :class:`~transformers.file_utils.FlaxSeq2SeqLMOutput` instead
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~file_utils.FlaxSeq2SeqLMOutput`] instead
of a plain tuple.
"""
ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.encoder.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.encoder.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a :class:`~transformers.file_utils.FlaxBaseModelOutput` instead
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~file_utils.FlaxBaseModelOutput`] instead
of a plain tuple.
"""
ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For sequence to sequence training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For sequence to sequence training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training.
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.decoder.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.decoder.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a
- :class:`~transformers.file_utils.FlaxCausalLMOutputWithCrossAttentions` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a
+ [`~file_utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a plain tuple.
"""
@@ -309,10 +305,10 @@ class FlaxEncoderDecoderModule(nn.Module):
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
r"""
- :class:`~transformers.FlaxEncoderDecoderModel` is a generic model class that will be instantiated as a transformer
+ [`FlaxEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer
architecture with the module (flax.nn.Module) of one of the base model classes of the library as encoder module and
- another one as decoder module when created with the :meth`~transformers.FlaxAutoModel.from_pretrained` class method
- for the encoder and :meth`~transformers.FlaxAutoModelForCausalLM.from_pretrained` class method for the decoder.
+ another one as decoder module when created with the [`~FlaxAutoModel.from_pretrained`] class method
+ for the encoder and [`~FlaxAutoModelForCausalLM.from_pretrained`] class method for the decoder.
"""
config_class = EncoderDecoderConfig
base_model_prefix = "encoder_decoder"
@@ -378,15 +374,14 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
@@ -753,48 +748,48 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
checkpoints.
Params:
- encoder_pretrained_model_name_or_path (:obj: `Union[str, os.PathLike]`, `optional`):
+ encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*):
Information necessary to initiate the encoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- decoder_pretrained_model_name_or_path (:obj: `Union[str, os.PathLike]`, `optional`, defaults to `None`):
+ decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
Information necessary to initiate the decoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- model_args (remaining positional arguments, `optional`):
- All remaning positional arguments will be passed to the underlying model's ``__init__`` method.
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`).
+ `output_attentions=True`).
- - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
- - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
+ - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
+ - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
- To update the parent model configuration, do not use a prefix for each configuration parameter.
- Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
- Example::
+ Example:
- >>> from transformers import FlaxEncoderDecoderModel
- >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
- >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
- >>> # saving model after fine-tuning
- >>> model.save_pretrained("./bert2gpt2")
- >>> # load fine-tuned model
- >>> model = FlaxEncoderDecoderModel.from_pretrained("./bert2gpt2")
-
- """
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel
+ >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./bert2gpt2")
+ >>> # load fine-tuned model
+ >>> model = FlaxEncoderDecoderModel.from_pretrained("./bert2gpt2")
+ ```"""
kwargs_encoder = {
argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 43905fcb6a..9dc68878f3 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -43,120 +43,118 @@ _CONFIG_FOR_DOC = "EncoderDecoderConfig"
ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
- :meth:`~transformers.TFAutoModel.from_pretrained` function and the decoder is loaded via
- :meth:`~transformers.TFAutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically
+ [`~TFAutoModel.from_pretrained`] function and the decoder is loaded via
+ [`~TFAutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically
added to the decoder and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
- tasks was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
- `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+ tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
(see the examples for more information).
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
Parameters:
- config (:class:`~transformers.EncoderDecoderConfig`): Model configuration class with all the parameters of the model.
+ config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`):
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
Provide for sequence to sequence training to the decoder. Indices can be obtained using
- :class:`~transformers.PreTrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and
- :meth:`transformers.PreTrainedTokenizer.__call__` for details.
- decoder_attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+ decoder_attention_mask (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- encoder_outputs (:obj:`tuple(tuple(tf.Tensor)`, `optional`):
- This tuple must consist of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`) is a
+ encoder_outputs (`tuple(tuple(tf.Tensor))`, *optional*):
+ This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` (`tf.Tensor` of shape `({0}, hidden_size)`) is a
tensor of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the
decoder.
- past_key_values (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(tf.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`({0})`.
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `({0})`.
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`decoder_input_ids`
+ decoder_inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `decoder_input_ids`
indices into associated vectors than the model's internal embedding lookup matrix.
- labels (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0,
- ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ labels (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a :class:`~transformers.file_utils.Seq2SeqLMOutput` instead of a
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~file_utils.Seq2SeqLMOutput`] instead of a
plain tuple.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
- kwargs: (`optional`) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
+ kwargs: (*optional*) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
- - Without a prefix which will be input as ``**encoder_kwargs`` for the encoder forward function.
- - With a `decoder_` prefix which will be input as ``**decoder_kwargs`` for the decoder forward function.
+ - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
+ - With a `decoder_` prefix which will be input as `**decoder_kwargs` for the decoder forward function.
"""
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
class TFEncoderDecoderModel(TFPreTrainedModel):
r"""
- :class:`~transformers.TFEncoderDecoder` is a generic model class that will be instantiated as a transformer
+ [`TFEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer
architecture with one of the base model classes of the library as encoder and another one as decoder when created
- with the :meth`~transformers.TFAutoModel.from_pretrained` class method for the encoder and
- :meth`~transformers.TFAutoModelForCausalLM.from_pretrained` class method for the decoder.
+ with the [`~TFAutoModel.from_pretrained`] class method for the encoder and
+ [`~TFAutoModelForCausalLM.from_pretrained`] class method for the decoder.
"""
config_class = EncoderDecoderConfig
base_model_prefix = "encoder_decoder"
@@ -233,7 +231,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
# Add `decoder_input_ids` because `self.decoder` requires it.
input_ids = tf.constant(DUMMY_INPUTS)
@@ -311,52 +309,52 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
Params:
- encoder_pretrained_model_name_or_path (:obj: `str`, `optional`):
+ encoder_pretrained_model_name_or_path (`str`, *optional*):
Information necessary to initiate the encoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `pytorch index checkpoint file` (e.g, ``./pt_model/``). In this case,
- ``encoder_from_pt`` should be set to :obj:`True`.
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
+ `encoder_from_pt` should be set to `True`.
- decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
+ decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
Information necessary to initiate the decoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `pytorch checkpoint file` (e.g, ``./pt_model/``). In this case,
- ``decoder_from_pt`` should be set to :obj:`True`.
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case,
+ `decoder_from_pt` should be set to `True`.
- model_args (remaining positional arguments, `optional`):
- All remaning positional arguments will be passed to the underlying model's ``__init__`` method.
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`).
+ `output_attentions=True`).
- - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
- - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
+ - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
+ - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
- To update the parent model configuration, do not use a prefix for each configuration parameter.
- Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
- Example::
+ Example:
- >>> from transformers import TFEncoderDecoderModel
- >>> # initialize a bert2gpt2 from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
- >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'gpt2')
- >>> # saving model after fine-tuning
- >>> model.save_pretrained("./bert2gpt2")
- >>> # load fine-tuned model
- >>> model = TFEncoderDecoderModel.from_pretrained("./bert2gpt2")
-
- """
+ ```python
+ >>> from transformers import TFEncoderDecoderModel
+ >>> # initialize a bert2gpt2 from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
+ >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'gpt2')
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./bert2gpt2")
+ >>> # load fine-tuned model
+ >>> model = TFEncoderDecoderModel.from_pretrained("./bert2gpt2")
+ ```"""
kwargs_encoder = {
argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py
index 9887b639c0..d13ed14bb8 100644
--- a/src/transformers/models/flaubert/modeling_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_flaubert.py
@@ -54,78 +54,76 @@ FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
FLAUBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
+ config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
FLAUBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.FlaubertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`FlaubertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ [What are position IDs?](../glossary#position-ids)
+ lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
- also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
- selected in ``[0, ..., input_ids.size(-1)]``:
- cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
- Dictionary strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the
- attention blocks) as computed by the model (see :obj:`cache` output below). Can be used to speed up
+ also use `attention_mask` for the same result (see above), kept here for compatibility. Indices
+ selected in `[0, ..., input_ids.size(-1)]`.
+ cache (`Dict[str, torch.FloatTensor]`, *optional*):
+ Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the
+ attention blocks) as computed by the model (see `cache` output below). Can be used to speed up
sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly
computed hidden-states.
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -327,7 +325,7 @@ class FlaubertModel(XLMModel):
)
class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
"""
- This class overrides :class:`~transformers.XLMWithLMHeadModel`. Please check the superclass for the appropriate
+ This class overrides [`XLMWithLMHeadModel`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
@@ -349,7 +347,7 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
)
class FlaubertForSequenceClassification(XLMForSequenceClassification):
"""
- This class overrides :class:`~transformers.XLMForSequenceClassification`. Please check the superclass for the
+ This class overrides [`XLMForSequenceClassification`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -371,7 +369,7 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification):
)
class FlaubertForTokenClassification(XLMForTokenClassification):
"""
- This class overrides :class:`~transformers.XLMForTokenClassification`. Please check the superclass for the
+ This class overrides [`XLMForTokenClassification`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -393,7 +391,7 @@ class FlaubertForTokenClassification(XLMForTokenClassification):
)
class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
"""
- This class overrides :class:`~transformers.XLMForQuestionAnsweringSimple`. Please check the superclass for the
+ This class overrides [`XLMForQuestionAnsweringSimple`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -415,7 +413,7 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
)
class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
"""
- This class overrides :class:`~transformers.XLMForQuestionAnswering`. Please check the superclass for the
+ This class overrides [`XLMForQuestionAnswering`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
@@ -437,7 +435,7 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
)
class FlaubertForMultipleChoice(XLMForMultipleChoice):
"""
- This class overrides :class:`~transformers.XLMForMultipleChoice`. Please check the superclass for the appropriate
+ This class overrides [`XLMForMultipleChoice`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py
index eecd686ef1..97089fc666 100644
--- a/src/transformers/models/flaubert/modeling_tf_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py
@@ -62,111 +62,111 @@ TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
FLAUBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
+ config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
FLAUBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.FlaubertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`FlaubertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- - ``1`` for tokens that are **not masked**,
- - ``0`` for tokens that are **masked**.
+ - `1` for tokens that are **not masked**,
+ - `0` for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ langs (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
languages ids which can be obtained from the language names by using two conversion mappings provided in
- the configuration of the model (only provided for multilingual models). More precisely, the `language name
- to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
- `language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
+ the configuration of the model (only provided for multilingual models). More precisely, the *language name
+ to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+ *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
- See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
- token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ See usage examples detailed in the [multilingual documentation](../multilingual).
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - ``0`` corresponds to a `sentence A` token,
- - ``1`` corresponds to a `sentence B` token.
+ - `0` corresponds to a *sentence A* token,
+ - `1` corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
+ [What are position IDs?](../glossary#position-ids)
+ lengths (`tf.Tensor` or `Numpy array` of shape `(batch_size,)`, *optional*):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
- also use `attention_mask` for the same result (see above), kept here for compatibility Indices selected in
- ``[0, ..., input_ids.size(-1)]``:
- cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
- Dictionary string to ``tf.FloatTensor`` that contains precomputed hidden states (key and values in the
- attention blocks) as computed by the model (see :obj:`cache` output below). Can be used to speed up
+ also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
+ `[0, ..., input_ids.size(-1)]`.
+ cache (`Dict[str, tf.Tensor]`, *optional*):
+ Dictionary string to `tf.FloatTensor` that contains precomputed hidden states (key and values in the
+ attention blocks) as computed by the model (see `cache` output below). Can be used to speed up
sequential decoding.
The dictionary object will be modified in-place during the forward pass to add newly computed
hidden-states.
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- - ``1`` indicates the head is **not masked**,
- - ``0`` indicates the head is **masked**.
+ - `1` indicates the head is **not masked**,
+ - `0` indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -761,19 +761,18 @@ class TFFlaubertPredLayer(tf.keras.layers.Layer):
@dataclass
class TFFlaubertWithLMHeadModelOutput(ModelOutput):
"""
- Base class for :class:`~transformers.TFFlaubertWithLMHeadModel` outputs.
+ Base class for [`TFFlaubertWithLMHeadModel`] outputs.
Args:
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py
index 9eb17f2c6a..78eaa9b255 100755
--- a/src/transformers/models/fnet/modeling_fnet.py
+++ b/src/transformers/models/fnet/modeling_fnet.py
@@ -442,20 +442,20 @@ class FNetPreTrainedModel(PreTrainedModel):
@dataclass
class FNetForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.FNetForPreTraining`.
+ Output type of [`FNetForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
"""
@@ -466,50 +466,48 @@ class FNetForPreTrainingOutput(ModelOutput):
FNET_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.FNetConfig`): Model configuration class with all the parameters of the model.
+ config ([`FNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
FNET_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.FNetTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`FNetTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
+ [What are position IDs?](../glossary#position-ids)
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -657,33 +655,32 @@ class FNetForPreTraining(FNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring) Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
- Example::
+ Example:
-
- >>> from transformers import FNetTokenizer, FNetForPreTraining
- >>> import torch
- >>> tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')
- >>> model = FNetForPreTraining.from_pretrained('google/fnet-base')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
- """
+ ```python
+ >>> from transformers import FNetTokenizer, FNetForPreTraining
+ >>> import torch
+ >>> tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')
+ >>> model = FNetForPreTraining.from_pretrained('google/fnet-base')
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.fnet(
@@ -752,10 +749,9 @@ class FNetForMaskedLM(FNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -811,29 +807,29 @@ class FNetForNextSentencePrediction(FNetPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
- Example::
+ Example:
-
- >>> from transformers import FNetTokenizer, FNetForNextSentencePrediction
- >>> import torch
- >>> tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')
- >>> model = FNetForNextSentencePrediction.from_pretrained('google/fnet-base')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
- >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
- >>> logits = outputs.logits
- >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
- """
+ ```python
+ >>> from transformers import FNetTokenizer, FNetForNextSentencePrediction
+ >>> import torch
+ >>> tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')
+ >>> model = FNetForNextSentencePrediction.from_pretrained('google/fnet-base')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```"""
if "next_sentence_label" in kwargs:
warnings.warn(
@@ -910,10 +906,9 @@ class FNetForSequenceClassification(FNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -995,10 +990,9 @@ class FNetForMultipleChoice(FNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1077,9 +1071,8 @@ class FNetForTokenClassification(FNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1148,13 +1141,13 @@ class FNetForQuestionAnswering(FNetPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py
index a1c4c1ed8c..70294d269a 100644
--- a/src/transformers/models/fsmt/modeling_fsmt.py
+++ b/src/transformers/models/fsmt/modeling_fsmt.py
@@ -182,18 +182,18 @@ PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/w
FSMT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.FSMTConfig`): Model configuration class with all the parameters of the model.
+ config ([`FSMTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
@@ -218,76 +218,75 @@ FSMT_GENERATION_EXAMPLE = r"""
FSMT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- IIndices can be obtained using :class:`~transformers.FSTMTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`FSMTTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.FSMTTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`FSMTTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- FSMT uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`Tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a
+ encoder_outputs (`Tuple(torch.FloatTensor)`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a
sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
the decoder.
- past_key_values (:obj:`Tuple(torch.FloatTensor)` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple(torch.FloatTensor)` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -411,16 +410,16 @@ class EncoderLayer(nn.Module):
def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
"""
Args:
- x (:obj:`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_padding_mask (:obj:`torch.ByteTensor`): binary ByteTensor of shape
- `(batch, src_len)` where padding elements are indicated by ``1``.
+ x (`torch.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_padding_mask (`torch.ByteTensor`): binary ByteTensor of shape
+ *(batch, src_len)* where padding elements are indicated by `1`.
for t_tgt, t_src is excluded (or masked out), =0 means it is
included in attention
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(config.encoder_attention_heads,)`.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(config.encoder_attention_heads,)*.
Returns:
- encoded output of shape `(seq_len, batch, embed_dim)`
+ encoded output of shape *(seq_len, batch, embed_dim)*
"""
residual = x
x, attn_weights = self.self_attn(
@@ -447,7 +446,7 @@ class EncoderLayer(nn.Module):
class FSMTEncoder(nn.Module):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`EncoderLayer`.
+ [`EncoderLayer`].
Args:
config: FSMTConfig
@@ -479,11 +478,11 @@ class FSMTEncoder(nn.Module):
):
"""
Args:
- input_ids (:obj:`torch.LongTensor`): tokens in the source language of shape
- `(batch, src_len)`
- attention_mask (:obj:`torch.LongTensor`): indicating which indices are padding tokens
- head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ input_ids (`torch.LongTensor`): tokens in the source language of shape
+ *(batch, src_len)*
+ attention_mask (`torch.LongTensor`): indicating which indices are padding tokens
+ head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
@@ -491,10 +490,10 @@ class FSMTEncoder(nn.Module):
Returns:
BaseModelOutput or Tuple comprised of:
- - **x** (:obj:`torch.Tensor`): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
- - **encoder_states** (:obj:`Tuple(torch.FloatTensor`)): all intermediate hidden states of shape
- `(src_len, batch, embed_dim)`. Only populated if *output_hidden_states:* is True.
- - **all_attentions** (:obj:`Tuple(torch.FloatTensor`)): Attention weights for each layer.
+ - **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
+ - **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape
+ *(src_len, batch, embed_dim)*. Only populated if *output_hidden_states* is True.
+ - **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout.
"""
# check attention mask and invert
@@ -637,7 +636,7 @@ class DecoderLayer(nn.Module):
class FSMTDecoder(nn.Module):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]
Args:
config: FSMTConfig
@@ -689,20 +688,20 @@ class FSMTDecoder(nn.Module):
EMNLP 2019).
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch, tgt_len)`):
+ input_ids (`torch.LongTensor` of shape `(batch, tgt_len)`):
previous decoder outputs for teacher forcing
encoder_hidden_states: output from the encoder, used for
encoder-side attention
encoder_padding_mask: for ignoring pad tokens
past_key_values (dict or None): dictionary used for storing state during generation
- head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
@@ -710,7 +709,7 @@ class FSMTDecoder(nn.Module):
Returns:
BaseModelOutputWithPast or tuple:
- - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+ - the decoder's features of shape *(batch, tgt_len, embed_dim)*
- the cache
- hidden states
- attentions
@@ -1153,10 +1152,9 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py
index fffed242fd..b55c50f238 100644
--- a/src/transformers/models/funnel/modeling_funnel.py
+++ b/src/transformers/models/funnel/modeling_funnel.py
@@ -800,21 +800,20 @@ class FunnelClassificationHead(nn.Module):
@dataclass
class FunnelForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.FunnelForPreTraining`.
+ Output type of [`FunnelForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss of the ELECTRA-style objective.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -828,61 +827,60 @@ class FunnelForPreTrainingOutput(ModelOutput):
FUNNEL_START_DOCSTRING = r"""
- The Funnel Transformer model was proposed in `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
- Language Processing `__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+ The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
+ Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.FunnelConfig`): Model configuration class with all the parameters of the model.
+ config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
FUNNEL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1101,26 +1099,27 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
return_dict=None,
):
r"""
- labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see :obj:`input_ids`
- docstring) Indices should be in ``[0, 1]``:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
+ docstring). Indices should be in `[0, 1]`:
- 0 indicates the token is an original token,
- 1 indicates the token was replaced.
Returns:
- Examples::
+ Examples:
- >>> from transformers import FunnelTokenizer, FunnelForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import FunnelTokenizer, FunnelForPreTraining
+ >>> import torch
- >>> tokenizer = FunnelTokenizer.from_pretrained('funnel-transformer/small')
- >>> model = FunnelForPreTraining.from_pretrained('funnel-transformer/small')
+ >>> tokenizer = FunnelTokenizer.from_pretrained('funnel-transformer/small')
+ >>> model = FunnelForPreTraining.from_pretrained('funnel-transformer/small')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "pt")
- >>> logits = model(**inputs).logits
- """
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "pt")
+ >>> logits = model(**inputs).logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.funnel(
@@ -1196,10 +1195,9 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1270,10 +1268,9 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1361,10 +1358,9 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1448,9 +1444,8 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1532,13 +1527,13 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py
index 9011304784..04a4208fae 100644
--- a/src/transformers/models/funnel/modeling_tf_funnel.py
+++ b/src/transformers/models/funnel/modeling_tf_funnel.py
@@ -102,7 +102,7 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
assert not (input_ids is not None and inputs_embeds is not None)
@@ -1004,19 +1004,18 @@ class TFFunnelPreTrainedModel(TFPreTrainedModel):
@dataclass
class TFFunnelForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.FunnelForPreTraining`.
+ Output type of [`TFFunnelForPreTraining`].
Args:
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -1029,84 +1028,85 @@ class TFFunnelForPreTrainingOutput(ModelOutput):
FUNNEL_START_DOCSTRING = r"""
- The Funnel Transformer model was proposed in `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
- Language Processing <https://arxiv.org/abs/2006.03236>`__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+ The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
+ Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using the [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
+ config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
FUNNEL_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.FunnelTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`FunnelTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1356,10 +1356,9 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss)
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1445,10 +1444,9 @@ class TFFunnelForSequenceClassification(TFFunnelPreTrainedModel, TFSequenceClass
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1544,10 +1542,9 @@ class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss):
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1674,9 +1671,8 @@ class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificat
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1767,13 +1763,13 @@ class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringL
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/gpt2/modeling_flax_gpt2.py b/src/transformers/models/gpt2/modeling_flax_gpt2.py
index 00a3e6d403..c3931a17de 100644
--- a/src/transformers/models/gpt2/modeling_flax_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -42,71 +42,69 @@ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
GPT2_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- <https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html>`__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
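- - `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__
- - `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__
- - `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__
- - `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__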
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+ config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
GPT2_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length``. Indices of input sequence tokens in the vocabulary.
+ input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`GPT2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -430,9 +428,9 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 1fbee09990..bdc019fcba 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -494,28 +494,26 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
+ mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
Multiple choice classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
+ mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
- past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of length :obj:`config.n_layers`, containing tuples of tensors of shape :obj:`(batch_size, num_heads,
- sequence_length, embed_size_per_head)`).
+ past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
GPT2Attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -532,85 +530,83 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
GPT2_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+ config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
GPT2_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
- ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
+ input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+ `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
sequence tokens in the vocabulary.
- If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
- passed as ``input_ids``.
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be
+ passed as `input_ids`.
- Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`GPT2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers`):
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
- :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
- have their past given to this model should not be passed as ``input_ids`` as they have already been
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which
+ have their past given to this model should not be passed as `input_ids` as they have already been
computed.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
- :obj:`past_key_values`).
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
+ `past_key_values`).
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
PARALLELIZE_DOCSTRING = r"""
This is an experimental feature and is a subject to change at a moment's notice.
@@ -619,7 +615,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices.
Args:
- device_map (:obj:`Dict[int, list]`, optional, defaults to None):
+ device_map (`Dict[int, list]`, *optional*, defaults to `None`):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
@@ -630,16 +626,18 @@ PARALLELIZE_DOCSTRING = r"""
- gpt2-large: 36
- gpt2-xl: 48
- Example::
+ Example:
- # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
- model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
- device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
+ ```python
+ # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
+ model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
- 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
- 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
- 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}
- model.parallelize(device_map)
+ 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
+ 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
+ 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}
+ model.parallelize(device_map)
+ ```
"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to cpu from a model parallel state.
@@ -1034,10 +1032,10 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+ `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1090,9 +1088,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
@staticmethod
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
"""
- This function is used to re-order the :obj:`past_key_values` cache if
- :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
- called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ This function is used to re-order the `past_key_values` cache if
+ [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is
+ called. This is required to match `past_key_values` with the correct beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
@@ -1204,45 +1202,43 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
**kwargs,
):
r"""
- mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
- Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
- 1[``.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
+ Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - 1[`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size - 1]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size - 1]``
- mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
- `input_ids` above)
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
+ `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
+ mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors (see
+ `input_ids` above).
Return:
- Example::
+ Example:
- >>> import torch
- >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+ ```python
+ >>> import torch
+ >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
- >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
- >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+ >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
- >>> # Add a [CLS] to the vocabulary (we should train it also!)
- >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+ >>> # Add a [CLS] to the vocabulary (we should train it also!)
+ >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
- >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
+ >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
- >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
- >>> encoded_choices = [tokenizer.encode(s) for s in choices]
- >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+ >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+ >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
- >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
- >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
+ >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
+ >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
- >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
- >>> lm_logits = outputs.logits
- >>> mc_logits = outputs.mc_logits
-
- """
+ >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+ >>> lm_logits = outputs.logits
+ >>> mc_logits = outputs.mc_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
@@ -1299,9 +1295,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
@staticmethod
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
"""
- This function is used to re-order the :obj:`past_key_values` cache if
- :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
- called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ This function is used to re-order the `past_key_values` cache if
+ [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is
+ called. This is required to match `past_key_values` with the correct beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
@@ -1313,13 +1309,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
"""
The GPT2 Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as
+ [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
- :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
- row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
- guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
+ row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
the last value in each row of the batch).
""",
GPT2_START_DOCSTRING,
@@ -1363,10 +1359,9 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1495,10 +1490,9 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index f08e5662cb..568ba30e50 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -558,7 +558,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
dummy = {"input_ids": tf.constant(DUMMY_INPUTS)}
# Add `encoder_hidden_states` to make the cross-attention layers' weights initialized
@@ -590,24 +590,22 @@ class TFGPT2DoubleHeadsModelOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
+ mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
- :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -622,100 +620,100 @@ class TFGPT2DoubleHeadsModelOutput(ModelOutput):
GPT2_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+ config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
GPT2_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]``
- (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary.
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]`
+ (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
- If :obj:`past` is used, only input IDs that do not have their past calculated should be passed as
- ``input_ids``.
+ If `past` is used, only input IDs that do not have their past calculated should be passed as
+ `input_ids`.
- Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`GPT2Tokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
+ [What are input IDs?](../glossary#input-ids)
+ past (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
- :obj:`past` output below). Can be used to speed up sequential decoding. The token ids which have their past
+ `past` output below). Can be used to speed up sequential decoding. The token ids which have their past
given to this model should not be passed as input ids as they have already been computed.
- attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -756,24 +754,24 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
**kwargs,
):
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that
- don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all
- :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past`). Set to :obj:`False` during training, :obj:`True` during generation
+ If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past`). Set to `False` during training, `True` during generation
"""
inputs = input_processing(
func=self.call,
@@ -886,27 +884,26 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
**kwargs,
):
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that
- don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all
- :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past`). Set to :obj:`False` during training, :obj:`True` during generation
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past`). Set to `False` during training, `True` during generation
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1022,36 +1019,35 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
**kwargs,
):
r"""
- mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
- Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
- 1[``.
+ mc_token_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
+ Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - 1[`.
Return:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
- >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
- >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
+ >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
- >>> # Add a [CLS] to the vocabulary (we should train it also!)
- >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+ >>> # Add a [CLS] to the vocabulary (we should train it also!)
+ >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
- >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
+ >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
- >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
- >>> encoded_choices = [tokenizer.encode(s) for s in choices]
- >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+ >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+ >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
- >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
- >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
+ >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
+ >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
- >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
- >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
-
- """
+ >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+ >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
@@ -1152,13 +1148,13 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
"""
The GPT2 Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.TFGPT2ForSequenceClassification` uses the last token in order to do the classification, as
+ [`TFGPT2ForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
- :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
- row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
- guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
+ row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
the last value in each row of the batch).
""",
GPT2_START_DOCSTRING,
@@ -1200,9 +1196,8 @@ class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassific
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
index c43343ecaf..17e042dec8 100644
--- a/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
@@ -40,71 +40,69 @@ _CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
GPT_NEO_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.GPTNeoConfig`): Model configuration class with all the parameters of the model.
+ config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
GPT_NEO_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length``. Indices of input sequence tokens in the vocabulary.
+ input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.GPTNeoTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`GPTNeoTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -374,9 +372,9 @@ class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 7046f75b55..13b3278f53 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -389,85 +389,83 @@ class GPTNeoPreTrainedModel(PreTrainedModel):
GPT_NEO_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.GPTNeoConfig`): Model configuration class with all the parameters of the model.
+ config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
GPT_NEO_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
- ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
+ input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+ `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
sequence tokens in the vocabulary.
- If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
- passed as ``input_ids``.
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be
+ passed as `input_ids`.
- Indices can be obtained using :class:`~transformers.GPTNeoTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`GPTNeoTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.num_layers`):
+ [What are input IDs?](../glossary#input-ids)
+ past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.num_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
- :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
- have their past given to this model should not be passed as ``input_ids`` as they have already been
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which
+ have their past given to this model should not be passed as `input_ids` as they have already been
computed.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
- :obj:`past_key_values`).
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
+ `past_key_values`).
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -736,10 +734,10 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -791,9 +789,9 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
@staticmethod
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
"""
- This function is used to re-order the :obj:`past_key_values` cache if
- :meth:`~transformers.PretrainedModel.beam_search` or :meth:`~transformers.PretrainedModel.beam_sample` is
- called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ This function is used to re-order the `past_key_values` cache if
+ [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is
+ called. This is required to match `past_key_values` with the correct beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
@@ -805,13 +803,13 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
"""
The GPTNeo Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.GPTNeoForSequenceClassification` uses the last token in order to do the classification, as
+ [`GPTNeoForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
- :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
- row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
- guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
+ row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
the last value in each row of the batch).
""",
GPT_NEO_START_DOCSTRING,
@@ -851,10 +849,9 @@ class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/gptj/modeling_flax_gptj.py b/src/transformers/models/gptj/modeling_flax_gptj.py
index 8b81ba9f15..840d7ed53b 100644
--- a/src/transformers/models/gptj/modeling_flax_gptj.py
+++ b/src/transformers/models/gptj/modeling_flax_gptj.py
@@ -42,71 +42,69 @@ _TOKENIZER_FOR_DOC = "GPTJTokenizer"
GPTJ_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.GPTJConfig`): Model configuration class with all the parameters of the model.
+ config ([`GPTJConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
GPTJ_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, input_ids_length)`):
- :obj:`input_ids_length` = ``sequence_length``. Indices of input sequence tokens in the vocabulary.
+ input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+ `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.GPTJTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`GPTJTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -409,9 +407,9 @@ class FlaxGPTJPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index a046300636..0c6b60f65f 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -335,65 +335,63 @@ class GPTJPreTrainedModel(PreTrainedModel):
GPTJ_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.GPTJConfig`): Model configuration class with all the parameters of the model.
+ config ([`GPTJConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
GPTJ_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.GPTJTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`GPTJTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.n_positions - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_attention_heads,)` or :obj:`(n_layer, num_attention_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_dim)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
PARALLELIZE_DOCSTRING = r"""
@@ -402,7 +400,7 @@ PARALLELIZE_DOCSTRING = r"""
across all devices.
Args:
- device_map (:obj:`Dict[int, list]`, optional, defaults to None):
+ device_map (`Dict[int, list]`, *optional*, defaults to `None`):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the
@@ -410,14 +408,17 @@ PARALLELIZE_DOCSTRING = r"""
- gpt-j-6B: 28
- Example::
- # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
- model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
- device_map = {0: [0, 1, 2, 3, 4, 5, 6],
- 1: [7, 8, 9, 10, 11, 12, 13],
- 2: [14, 15, 16, 17, 18, 19, 20],
- 3: [21, 22, 23, 24, 25, 26, 27]}
- model.parallelize(device_map)
+ Example:
+
+ ```python
+ # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
+ model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6],
+ 1: [7, 8, 9, 10, 11, 12, 13],
+ 2: [14, 15, 16, 17, 18, 19, 20],
+ 3: [21, 22, 23, 24, 25, 26, 27]}
+ model.parallelize(device_map)
+ ```
"""
DEPARALLELIZE_DOCSTRING = r"""
@@ -772,10 +773,10 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -830,9 +831,9 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
@staticmethod
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
"""
- This function is used to re-order the :obj:`past_key_values` cache if
- :meth:`~transformers.PretrainedModel.beam_search` or :meth:`~transformers.PretrainedModel.beam_sample` is
- called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ This function is used to re-order the `past_key_values` cache if
+ [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is
+ called. This is required to match `past_key_values` with the correct beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
@@ -844,13 +845,13 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
"""
The GPT-J Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.GPTJForSequenceClassification` uses the last token in order to do the classification, as
+ [`GPTJForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT, GPT-2, GPT-Neo) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
- :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
- row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
- guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
+ row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
the last value in each row of the batch).
""",
GPTJ_START_DOCSTRING,
@@ -894,10 +895,9 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1019,13 +1019,13 @@ class GPTJForQuestionAnswering(GPTJPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 52d66831e8..416f6ce63d 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -838,59 +838,59 @@ class HubertPreTrainedModel(PreTrainedModel):
HUBERT_START_DOCSTRING = r"""
- Hubert was proposed in `HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units
- `__ by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,
+ Hubert was proposed in [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,
Ruslan Salakhutdinov, Abdelrahman Mohamed.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving etc.).
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.HubertConfig`): Model configuration class with all the parameters of the model.
+ config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
HUBERT_INPUTS_DOCSTRING = r"""
Args:
- input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
- Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
- into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
- soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
- be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
- :meth:`transformers.Wav2Vec2Processor.__call__` for details.
- attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
- 1]``:
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+ into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library
+ (`pip install soundfile`). To prepare the array into `input_values`, the [`Wav2Vec2Processor`] should
+ be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
+ [`Wav2Vec2Processor.__call__`] for details.
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- .. warning::
- :obj:`attention_mask` should only be passed if the corresponding processor has
- ``config.return_attention_mask == True``. For all models whose processor has
- ``config.return_attention_mask == False``, such as `hubert-base
- `__, :obj:`attention_mask` should **not** be passed
- to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should
- simply be padded with 0 and passed without :obj:`attention_mask`. Be aware that these models also yield
- slightly different results depending on whether :obj:`input_values` is padded or not.
+ <Tip warning={true}>
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ `attention_mask` should only be passed if the corresponding processor has
+ `config.return_attention_mask == True`. For all models whose processor has
+ `config.return_attention_mask == False`, such as [hubert-base](https://huggingface.co/facebook/hubert-base-ls960), `attention_mask` should **not** be passed
+ to avoid degraded performance when doing batched inference. For such models `input_values` should
+ simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield
+ slightly different results depending on whether `input_values` is padded or not.
+
+ </Tip>
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1081,11 +1081,9 @@ class HubertForCTC(HubertPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_length)`, `optional`):
- Labels for connectionist temporal classification. Note that ``target_length`` has to be smaller or equal to
- the sequence length of the output logits. Indices are selected in ``[-100, 0, ..., config.vocab_size -
- 1]``. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1199,10 +1197,9 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index d08b747b70..d25ee4f38c 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -56,14 +56,13 @@ LARGE_NEGATIVE = -1e8
def input_values_processing(func, config, input_values, **kwargs):
"""
Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input
- has to be named accordingly to the parameters name, i.e. :obj:`input_values = tf.keras.Input(shape=(128,),
- dtype='float32', name="input_values")` otherwise the order of the tensors will not be guaranteed during the
+ has to be named accordingly to the parameters name, i.e. `input_values = tf.keras.Input(shape=(128,), dtype='float32', name="input_values")` otherwise the order of the tensors will not be guaranteed during the
training.
Args:
- func (:obj:`callable`):
+ func (`callable`):
The callable function of the TensorFlow model.
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The config of the running model.
**kwargs:
The inputs of the model.
@@ -1286,92 +1285,92 @@ class TFHubertPreTrainedModel(TFPreTrainedModel):
HUBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accepts two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_values` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_values, attention_mask])` or :obj:`model([input_values, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_values": input_values, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_values` only and nothing else: `model(input_values)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_values": input_values, "token_type_ids": token_type_ids})`
+ </Tip>
+
Args:
- config (:class:`~transformers.HubertConfig`): Model configuration class with all the parameters of the model.
+ config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model
weights.
"""
HUBERT_INPUTS_DOCSTRING = r"""
Args:
- input_values (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`):
+ input_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_values` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_values` indices
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_values` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_values` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1508,45 +1507,45 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
training: Optional[bool] = False,
) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_values`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_values` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
- Example::
+ Example:
- >>> import tensorflow as tf
- >>> from transformers import Wav2Vec2Processor, TFHubertForCTC
- >>> from datasets import load_dataset
- >>> import soundfile as sf
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import Wav2Vec2Processor, TFHubertForCTC
+ >>> from datasets import load_dataset
+ >>> import soundfile as sf
- >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
- >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-base-960h")
+ >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
+ >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-base-960h")
- >>> def map_to_array(batch):
- ... speech, _ = sf.read(batch["file"])
- ... batch["speech"] = speech
- ... return batch
+ >>> def map_to_array(batch):
+ ... speech, _ = sf.read(batch["file"])
+ ... batch["speech"] = speech
+ ... return batch
- >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- >>> ds = ds.map(map_to_array)
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ >>> ds = ds.map(map_to_array)
- >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
- >>> logits = model(input_values).logits >>> predicted_ids = tf.argmax(logits, axis=-1)
+ >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
+ >>> logits = model(input_values).logits
+ >>> predicted_ids = tf.argmax(logits, axis=-1)
- >>> transcription = processor.decode(predicted_ids[0])
+ >>> transcription = processor.decode(predicted_ids[0])
- >>> # compute loss
- >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
+ >>> # compute loss
+ >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
- >>> # wrap processor as target processor to encode labels
- >>> with processor.as_target_processor():
- ... labels = processor(transcription, return_tensors="tf").input_values
+ >>> # wrap processor as target processor to encode labels
+ >>> with processor.as_target_processor():
+ ... labels = processor(target_transcription, return_tensors="tf").input_values
- >>> loss = model(input_values, labels=labels).loss
- """
+ >>> loss = model(input_values, labels=labels).loss
+ ```"""
inputs = input_values_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py
index 6666258e70..a8eb01bd14 100644
--- a/src/transformers/models/ibert/modeling_ibert.py
+++ b/src/transformers/models/ibert/modeling_ibert.py
@@ -662,69 +662,67 @@ class IBertPreTrainedModel(PreTrainedModel):
IBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.IBertConfig`): Model configuration class with all the parameters of the
+ config ([`IBertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
IBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`RobertaTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -897,11 +895,10 @@ class IBertForMaskedLM(IBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1005,10 +1002,9 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1101,10 +1097,9 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1196,9 +1191,8 @@ class IBertForTokenClassification(IBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1307,13 +1301,13 @@ class IBertForQuestionAnswering(IBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1370,10 +1364,10 @@ class IBertForQuestionAnswering(IBertPreTrainedModel):
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
- are ignored. This is modified from fairseq's `utils.make_positions`.
+ are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
- input_ids (:obj:`torch.LongTensor`):
+ input_ids (`torch.LongTensor`):
Indices of input sequence tokens in the vocabulary.
Returns: torch.Tensor
diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py
index 054054df80..e1568ab73a 100755
--- a/src/transformers/models/imagegpt/modeling_imagegpt.py
+++ b/src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -535,83 +535,81 @@ class ImageGPTPreTrainedModel(PreTrainedModel):
IMAGEGPT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.ImageGPTConfig`): Model configuration class with all the parameters of the model.
+ config ([`ImageGPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
IMAGEGPT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
- :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
- ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+ `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
sequence tokens in the vocabulary.
- If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
- passed as ``input_ids``.
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be
+ passed as `input_ids`.
- Indices can be obtained using :class:`~transformers.ImageGPTFeatureExtractor`. See
- :meth:`transformers.ImageGPTFeatureExtractor.__call__` for details.
+ Indices can be obtained using [`ImageGPTFeatureExtractor`]. See
+ [`ImageGPTFeatureExtractor.__call__`] for details.
- past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers`):
+ past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
- :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
- have their past given to this model should not be passed as ``input_ids`` as they have already been
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which
+ have their past given to this model should not be passed as `input_ids` as they have already been
computed.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
- :obj:`past_key_values`).
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
+ `past_key_values`).
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -674,29 +672,30 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Returns:
- Examples::
+ Examples:
- >>> from transformers import ImageGPTFeatureExtractor, ImageGPTModel
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import ImageGPTFeatureExtractor, ImageGPTModel
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
- >>> model = ImageGPTModel.from_pretrained('openai/imagegpt-small')
+ >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
+ >>> model = ImageGPTModel.from_pretrained('openai/imagegpt-small')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
if "pixel_values" in kwargs:
warnings.warn(
@@ -967,42 +966,43 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Returns:
- Examples::
+ Examples:
- >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForCausalImageModeling
- >>> import torch
- >>> import matplotlib.pyplot as plt
- >>> import numpy as np
+ ```python
+ >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForCausalImageModeling
+ >>> import torch
+ >>> import matplotlib.pyplot as plt
+ >>> import numpy as np
- >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
- >>> model = ImageGPTForCausalImageModeling.from_pretrained('openai/imagegpt-small')
- >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- >>> model.to(device)
+ >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
+ >>> model = ImageGPTForCausalImageModeling.from_pretrained('openai/imagegpt-small')
+ >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ >>> model.to(device)
- >>> # unconditional generation of 8 images
- >>> batch_size = 8
- >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1) #initialize with SOS token
- >>> context = torch.tensor(context).to(device)
- >>> output = model.generate(input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40)
+ >>> # unconditional generation of 8 images
+ >>> batch_size = 8
+ >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1) #initialize with SOS token
+ >>> context = torch.tensor(context).to(device)
+ >>> output = model.generate(input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40)
- >>> clusters = feature_extractor.clusters
- >>> n_px = feature_extractor.size
+ >>> clusters = feature_extractor.clusters
+ >>> n_px = feature_extractor.size
- >>> samples = output[:,1:].cpu().detach().numpy()
- >>> samples_img = [np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [n_px, n_px, 3]).astype(np.uint8) for s in samples] # convert color cluster tokens back to pixels
- >>> f, axes = plt.subplots(1, batch_size, dpi=300)
+ >>> samples = output[:,1:].cpu().detach().numpy()
+ >>> samples_img = [np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [n_px, n_px, 3]).astype(np.uint8) for s in samples] # convert color cluster tokens back to pixels
+ >>> f, axes = plt.subplots(1, batch_size, dpi=300)
- >>> for img, ax in zip(samples_img, axes):
- ... ax.axis('off')
- ... ax.imshow(img)
- """
+ >>> for img, ax in zip(samples_img, axes):
+ ... ax.axis('off')
+ ... ax.imshow(img)
+ ```"""
if "pixel_values" in kwargs:
warnings.warn(
@@ -1064,9 +1064,9 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
@staticmethod
def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
"""
- This function is used to re-order the :obj:`past_key_values` cache if
- :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
- called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ This function is used to re-order the `past_key_values` cache if
+ [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is
+ called. This is required to match `past_key_values` with the correct beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
@@ -1077,7 +1077,7 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
@add_start_docstrings(
"""
The ImageGPT Model transformer with an image classification head on top (linear layer).
- :class:`~transformers.ImageGPTForImageClassification` average-pools the hidden states in order to do the
+ [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the
classification.
""",
IMAGEGPT_START_DOCSTRING,
@@ -1113,29 +1113,29 @@ class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForImageClassification
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import ImageGPTFeatureExtractor, ImageGPTForImageClassification
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
- >>> model = ImageGPTForImageClassification.from_pretrained('openai/imagegpt-small')
+ >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
+ >>> model = ImageGPTForImageClassification.from_pretrained('openai/imagegpt-small')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```"""
if "pixel_values" in kwargs:
warnings.warn(
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index 251ad624cf..3aae3c62dd 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -638,65 +638,61 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
LAYOUTLM_START_DOCSTRING = r"""
- The LayoutLM model was proposed in `LayoutLM: Pre-training of Text and Layout for Document Image Understanding
- `__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei and Ming Zhou.
+ The LayoutLM model was proposed in [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei and Ming Zhou.
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.LayoutLMConfig`): Model configuration class with all the parameters of the model.
+ config ([`LayoutLMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LAYOUTLM_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.LayoutLMTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LayoutLMTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- bbox (:obj:`torch.LongTensor` of shape :obj:`({0}, 4)`, `optional`):
- Bounding boxes of each input sequence tokens. Selected in the range ``[0,
- config.max_2d_position_embeddings-1]``. Each bounding box should be a normalized version in (x0, y0, x1,
+ [What are input IDs?](../glossary#input-ids)
+ bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
+ Bounding boxes of each input sequence tokens. Selected in the range `[0, config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1,
y1) format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and
- (x1, y1) represents the position of the lower right corner. See :ref:`Overview` for normalization.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for
- tokens that are NOT MASKED, ``0`` for MASKED tokens.
+ (x1, y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: `1` for
+ tokens that are NOT MASKED, `0` for MASKED tokens.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: `0` corresponds to a *sentence A* token, `1` corresponds to a *sentence B* token
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1`
- indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: `1`
+ indicates the head is **not masked**, `0` indicates the head is **masked**.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ If set to `True`, the attentions tensors of all attention layers are returned. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned
+ output_hidden_states (`bool`, *optional*):
+ If set to `True`, the hidden states of all layers are returned. See `hidden_states` under returned
tensors for more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~file_utils.ModelOutput`] instead of a
plain tuple.
"""
@@ -888,44 +884,44 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, LayoutLMForMaskedLM
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMTokenizer, LayoutLMForMaskedLM
+ >>> import torch
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = LayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = LayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "[MASK]"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "[MASK]"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = torch.tensor([token_boxes])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = torch.tensor([token_boxes])
- >>> labels = tokenizer("Hello world", return_tensors="pt")["input_ids"]
+ >>> labels = tokenizer("Hello world", return_tensors="pt")["input_ids"]
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
- ... labels=labels)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=labels)
- >>> loss = outputs.loss
- """
+ >>> loss = outputs.loss
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlm(
@@ -1004,44 +1000,44 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification
+ >>> import torch
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = LayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = LayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "world"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "world"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = torch.tensor([token_boxes])
- >>> sequence_label = torch.tensor([1])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = torch.tensor([token_boxes])
+ >>> sequence_label = torch.tensor([1])
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
- ... labels=sequence_label)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=sequence_label)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlm(
@@ -1135,43 +1131,43 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification
+ >>> import torch
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = LayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = LayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "world"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "world"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = torch.tensor([token_boxes])
- >>> token_labels = torch.tensor([1,1,0,0]).unsqueeze(0) # batch size of 1
+ >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = torch.tensor([token_boxes])
+ >>> token_labels = torch.tensor([1,1,0,0]).unsqueeze(0) # batch size of 1
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
- ... labels=token_labels)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=token_labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlm(
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
index 088475f623..88326c109c 100644
--- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
@@ -136,7 +136,7 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -823,92 +823,91 @@ class TFLayoutLMPreTrainedModel(TFPreTrainedModel):
LAYOUTLM_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using the [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ </Tip>
Args:
- config (:class:`~transformers.LayoutLMConfig`): Model configuration class with all the parameters of the model.
+ config ([`LayoutLMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
LAYOUTLM_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LayoutLMTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`LayoutLMTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- bbox (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0}, 4)`, `optional`):
- Bounding Boxes of each input sequence tokens. Selected in the range ``[0,
- config.max_2d_position_embeddings- 1]``.
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ bbox (`Numpy array` or `tf.Tensor` of shape `({0}, 4)`, *optional*):
+ Bounding Boxes of each input sequence tokens. Selected in the range `[0, config.max_2d_position_embeddings - 1]`.
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1080,44 +1079,44 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, TFLayoutLMForMaskedLM
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import LayoutLMTokenizer, TFLayoutLMForMaskedLM
+ >>> import tensorflow as tf
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = TFLayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = TFLayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "[MASK]"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "[MASK]"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = tf.convert_to_tensor([token_boxes])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = tf.convert_to_tensor([token_boxes])
- >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]
+ >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
- ... labels=labels)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=labels)
- >>> loss = outputs.loss
- """
+ >>> loss = outputs.loss
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
@@ -1216,44 +1215,44 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, TFLayoutLMForSequenceClassification
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import LayoutLMTokenizer, TFLayoutLMForSequenceClassification
+ >>> import tensorflow as tf
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = TFLayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = TFLayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "world"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "world"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = tf.convert_to_tensor([token_boxes])
- >>> sequence_label = tf.convert_to_tensor([1])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = tf.convert_to_tensor([token_boxes])
+ >>> sequence_label = tf.convert_to_tensor([1])
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
- ... labels=sequence_label)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=sequence_label)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
@@ -1357,43 +1356,43 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, TFLayoutLMForTokenClassification
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMTokenizer, TFLayoutLMForTokenClassification
+ >>> import tensorflow as tf
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = TFLayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = TFLayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "world"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "world"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = tf.convert_to_tensor([token_boxes])
- >>> token_labels = tf.convert_to_tensor([1,1,0,0])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = tf.convert_to_tensor([token_boxes])
+ >>> token_labels = tf.convert_to_tensor([1,1,0,0])
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
- ... labels=token_labels)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=token_labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 93706000e6..e05bff9ebe 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -615,75 +615,72 @@ class LayoutLMv2VisualBackbone(nn.Module):
LAYOUTLMV2_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.LayoutLMv2Config`): Model configuration class with all the parameters of the model.
+ config ([`LayoutLMv2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LAYOUTLMV2_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
+ input_ids (`torch.LongTensor` of shape `{0}`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.LayoutLMv2Tokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LayoutLMv2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- bbox (:obj:`torch.LongTensor` of shape :obj:`({0}, 4)`, `optional`):
- Bounding boxes of each input sequence tokens. Selected in the range ``[0,
- config.max_2d_position_embeddings-1]``. Each bounding box should be a normalized version in (x0, y0, x1,
+ bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
+ Bounding boxes of each input sequence tokens. Selected in the range `[0, config.max_2d_position_embeddings - 1]`. Each bounding box should be a normalized version in (x0, y0, x1,
y1) format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and
(x1, y1) represents the position of the lower right corner.
- image (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)` or :obj:`detectron.structures.ImageList` whose :obj:`tensors` is of shape :obj:`(batch_size, num_channels, height, width)`):
+ image (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `detectron.structures.ImageList` whose `tensors` is of shape `(batch_size, num_channels, height, width)`):
Batch of document images.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -982,31 +979,31 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss).
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
- >>> from PIL import Image
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
+ >>> from PIL import Image
+ >>> import torch
- >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> model = LayoutLMv2ForSequenceClassification.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> model = LayoutLMv2ForSequenceClassification.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+ >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
- >>> encoding = processor(image, return_tensors="pt")
- >>> sequence_label = torch.tensor([1])
+ >>> encoding = processor(image, return_tensors="pt")
+ >>> sequence_label = torch.tensor([1])
- >>> outputs = model(**encoding, labels=sequence_label)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> outputs = model(**encoding, labels=sequence_label)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1150,31 +1147,31 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
- >>> from PIL import Image
+ ```python
+ >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
+ >>> from PIL import Image
- >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased', revision="no_ocr")
- >>> model = LayoutLMv2ForTokenClassification.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased', revision="no_ocr")
+ >>> model = LayoutLMv2ForTokenClassification.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
- >>> words = ["hello", "world"]
- >>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
- >>> word_labels = [0, 1]
+ >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+ >>> words = ["hello", "world"]
+ >>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+ >>> word_labels = [0, 1]
- >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+ >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
- >>> outputs = model(**encoding)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> outputs = model(**encoding)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1267,38 +1264,39 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
- >>> from PIL import Image
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
+ >>> from PIL import Image
+ >>> import torch
- >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
- >>> question = "what's his name?"
+ >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+ >>> question = "what's his name?"
- >>> encoding = processor(image, question, return_tensors="pt")
- >>> start_positions = torch.tensor([1])
- >>> end_positions = torch.tensor([3])
+ >>> encoding = processor(image, question, return_tensors="pt")
+ >>> start_positions = torch.tensor([1])
+ >>> end_positions = torch.tensor([3])
- >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
- >>> loss = outputs.loss
- >>> start_scores = outputs.start_logits
- >>> end_scores = outputs.end_logits
- """
+ >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
+ >>> loss = outputs.loss
+ >>> start_scores = outputs.start_logits
+ >>> end_scores = outputs.end_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 2f15448522..b0fd2e4ed7 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -170,14 +170,14 @@ class LEDEncoderSelfAttention(nn.Module):
output_attentions=False,
):
"""
- :class:`LEDEncoderSelfAttention` expects `len(hidden_states)` to be multiple of `attention_window`. Padding to
- `attention_window` happens in :meth:`LEDEncoderModel.forward` to avoid redoing the padding on each layer.
+ [`LEDEncoderSelfAttention`] expects `len(hidden_states)` to be a multiple of `attention_window`. Padding to
+ `attention_window` happens in [`LEDEncoderModel.forward`] to avoid redoing the padding on each layer.
- The `attention_mask` is changed in :meth:`LEDEncoderModel.forward` from 0, 1, 2 to:
+ The `attention_mask` is changed in [`LEDEncoderModel.forward`] from 0, 1, 2 to:
- * -10000: no attention
- * 0: local attention
- * +10000: global attention
+ - -10000: no attention
+ - 0: local attention
+ - +10000: global attention
"""
hidden_states = hidden_states.transpose(0, 1)
@@ -898,11 +898,11 @@ class LEDEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
"""
residual = hidden_states
attn_outputs = self.self_attn(
@@ -976,18 +976,18 @@ class LEDDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for encoder attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`): Whether the base model outputs attentions.
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(decoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for encoder attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`): Whether the base model outputs attentions.
This requires the attentions tensor to be reshaped in this function.
"""
residual = hidden_states
@@ -1112,32 +1112,29 @@ class LEDEncoderBaseModelOutput(ModelOutput):
Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1157,50 +1154,44 @@ class LEDSeq2SeqModelOutput(ModelOutput):
decoding.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_heads, sequence_length, embed_size_per_head)`).
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1224,49 +1215,44 @@ class LEDSeq2SeqLMOutput(ModelOutput):
Base class for sequence-to-sequence language models outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1291,49 +1277,44 @@ class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1358,51 +1339,46 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of sequence-to-sequence question answering models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1423,19 +1399,19 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
LED_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.LEDConfig`):
+ config ([`LEDConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LED_GENERATION_EXAMPLE = r"""
@@ -1482,115 +1458,110 @@ LED_GENERATION_EXAMPLE = r"""
LED_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.LEDTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- LED uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should read :func:`modeling_led._prepare_decoder_inputs` and
- modify to your needs. See diagram 1 in `the paper <https://arxiv.org/abs/1910.13461>`__ for more
+ If you want to change padding behavior, you should read [`modeling_led._prepare_decoder_inputs`] and
+ modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to decide the attention given on each token, local attention or global attention for the encoder.
Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
important for task-specific finetuning because it makes the model more flexible at representing the task.
For example, for classification, the <s> token should be given global attention. For QA, all question
- tokens should also have global attention. Please refer to the `Longformer paper
- <https://arxiv.org/abs/2004.05150>`__ for more details. Mask values selected in ``[0, 1]``:
+ tokens should also have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more details. Mask values selected in `[0, 1]`:
- 0 for local attention (a sliding window attention),
- 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*, is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class LEDEncoder(LEDPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`LEDEncoderLayer`.
+ [`LEDEncoderLayer`].
Args:
config: LEDConfig
@@ -1700,49 +1671,48 @@ class LEDEncoder(LEDPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.LEDTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to decide the attention given on each token, local attention or global attention for the encoder.
Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
important for task-specific finetuning because it makes the model more flexible at representing the
task. For example, for classification, the <s> token should be given global attention. For QA, all
- question tokens should also have global attention. Please refer to the `Longformer paper
- <https://arxiv.org/abs/2004.05150>`__ for more details. Mask values selected in ``[0, 1]``:
+ question tokens should also have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more details. Mask values selected in `[0, 1]`:
- 0 for local attention (a sliding window attention),
- 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1879,7 +1849,7 @@ class LEDEncoder(LEDPreTrainedModel):
class LEDDecoder(LEDPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`LEDDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`LEDDecoderLayer`].
Args:
config: LEDConfig
@@ -1927,80 +1897,76 @@ class LEDDecoder(LEDPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.LEDTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to decide the attention given on each token, local attention or global attention. Tokens with
global attention attends to all other tokens, and all other tokens attend to them. This is important
for task-specific finetuning because it makes the model more flexible at representing the task. For
example, for classification, the <s> token should be given global attention. For QA, all question
- tokens should also have global attention. Please refer to the `Longformer paper
- <https://arxiv.org/abs/2004.05150>`__ for more details. Mask values selected in ``[0, 1]``:
+ tokens should also have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more details. Mask values selected in `[0, 1]`:
- 0 for local attention (a sliding window attention),
- 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -2339,25 +2305,25 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
- Conditional generation example::
+ Conditional generation example:
- >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
- >>> tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
- >>> TXT = "My friends are but they eat too many carbs."
+ ```python
+ >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
+ >>> tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
+ >>> TXT = "My friends are <mask> but they eat too many carbs."
- >>> model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
- >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+ >>> model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
+ >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
- >>> prediction = model.generate(input_ids)[0]
- >>> print(tokenizer.decode(prediction, skip_special_tokens=True))
- """
+ >>> prediction = model.generate(input_ids)[0]
+ >>> print(tokenizer.decode(prediction, skip_special_tokens=True))
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
@@ -2497,9 +2463,8 @@ class LEDForSequenceClassification(LEDPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
@@ -2625,13 +2590,13 @@ class LEDForQuestionAnswering(LEDPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index fa8904e474..b12c1d0786 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -185,14 +185,14 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
training=False,
):
"""
- LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`. Padding to
- `attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer.
+ LongformerSelfAttention expects *len(hidden_states)* to be multiple of *attention_window*. Padding to
+ *attention_window* happens in LongformerModel.forward to avoid redoing the padding on each layer.
- The `attention_mask` is changed in :meth:`LongformerModel.forward` from 0, 1, 2 to:
+ The *attention_mask* is changed in [`LongformerModel.forward`] from 0, 1, 2 to:
- * -10000: no attention
- * 0: local attention
- * +10000: global attention
+ - -10000: no attention
+ - 0: local attention
+ - +10000: global attention
"""
# retrieve input args
(
@@ -1127,11 +1127,11 @@ class TFLEDEncoderLayer(tf.keras.layers.Layer):
):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(config.encoder_attention_heads,)`.
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(config.encoder_attention_heads,)*.
"""
residual = hidden_states
layer_outputs = self.self_attn(
@@ -1203,17 +1203,17 @@ class TFLEDDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(config.encoder_attention_heads,)`.
- encoder_layer_head_mask (:obj:`tf.Tensor`): mask for encoder attention heads in a given layer of
- size `(config.encoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(config.encoder_attention_heads,)*.
+ encoder_layer_head_mask (`tf.Tensor`): mask for encoder attention heads in a given layer of
+ size *(config.encoder_attention_heads,)*.
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
@@ -1309,31 +1309,29 @@ class TFLEDEncoderBaseModelOutput(ModelOutput):
Base class for Longformer's outputs, with potential hidden states, local and global attentions.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1353,50 +1351,45 @@ class TFLEDSeq2SeqModelOutput(ModelOutput):
decoding.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ encoder_global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1420,49 +1413,45 @@ class TFLEDSeq2SeqLMOutput(ModelOutput):
Base class for sequence-to-sequence language models outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
- encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- encoder_global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ encoder_global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -1482,106 +1471,108 @@ class TFLEDSeq2SeqLMOutput(ModelOutput):
LED_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ </Tip>
Args:
- config (:class:`~transformers.LEDConfig`): Model configuration class with all the parameters of the model.
+ config ([`LEDConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
LED_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- LED uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1592,7 +1583,7 @@ class TFLEDEncoder(tf.keras.layers.Layer):
config_class = LEDConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFLEDEncoderLayer`.
+ [`TFLEDEncoderLayer`].
Args:
config: LEDConfig
@@ -1648,40 +1639,40 @@ class TFLEDEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.LEDTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
inputs = input_processing(
func=self.call,
@@ -1857,7 +1848,7 @@ class TFLEDEncoder(tf.keras.layers.Layer):
class TFLEDDecoder(tf.keras.layers.Layer):
config_class = LEDConfig
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFLEDDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFLEDDecoderLayer`].
Args:
config: LEDConfig
@@ -1904,58 +1895,55 @@ class TFLEDDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
- provide it. Indices can be obtained using :class:`~transformers.LEDTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
- for details. `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ provide it. Indices can be obtained using [`LEDTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+ for details. [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
+ encoder_head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
- on hidden heads. Mask values selected in ``[0, 1]``:
+ on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
- decoding. If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ decoding. If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size,
+ sequence_length)`. inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 9e0fb45c1a..6524f42d80 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -63,32 +63,29 @@ class LongformerBaseModelOutput(ModelOutput):
Base class for Longformer's outputs, with potential hidden states, local and global attentions.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -107,36 +104,33 @@ class LongformerBaseModelOutputWithPooling(ModelOutput):
Base class for Longformer's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -156,34 +150,31 @@ class LongformerMaskedLMOutput(ModelOutput):
Base class for masked language models outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -203,36 +194,33 @@ class LongformerQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering Longformer models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -253,34 +241,31 @@ class LongformerSequenceClassifierOutput(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -300,36 +285,33 @@ class LongformerMultipleChoiceModelOutput(ModelOutput):
Base class for outputs of multiple choice Longformer models.
Args:
- loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
- `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+ logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
+ `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -349,34 +331,31 @@ class LongformerTokenClassifierOutput(ModelOutput):
Base class for outputs of token classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x + attention_window + 1)`, where ``x`` is the number of tokens with global attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention
mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, x)`, where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -560,14 +539,14 @@ class LongformerSelfAttention(nn.Module):
output_attentions=False,
):
"""
- :class:`LongformerSelfAttention` expects `len(hidden_states)` to be multiple of `attention_window`. Padding to
- `attention_window` happens in :meth:`LongformerModel.forward` to avoid redoing the padding on each layer.
+ [`LongformerSelfAttention`] expects `len(hidden_states)` to be a multiple of `attention_window`. Padding to
+ `attention_window` happens in [`LongformerModel.forward`] to avoid redoing the padding on each layer.
- The `attention_mask` is changed in :meth:`LongformerModel.forward` from 0, 1, 2 to:
+ The `attention_mask` is changed in [`LongformerModel.forward`] from 0, 1, 2 to:
- * -10000: no attention
- * 0: local attention
- * +10000: global attention
+ - -10000: no attention
+ - 0: local attention
+ - +10000: global attention
"""
hidden_states = hidden_states.transpose(0, 1)
@@ -1390,86 +1369,84 @@ class LongformerPreTrainedModel(PreTrainedModel):
LONGFORMER_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.LongformerConfig`): Model configuration class with all the parameters of the
+ config ([`LongformerConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LONGFORMER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LongformerTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LongformerTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ global_attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attends to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the token should be given global attention. For QA, all question tokens should also
- have global attention. Please refer to the `Longformer paper `__ for more
- details. Mask values selected in ``[0, 1]``:
+ have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more
+ details. Mask values selected in `[0, 1]`:
- 0 for local attention (a sliding window attention),
- 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
- head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1479,13 +1456,13 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
)
class LongformerModel(LongformerPreTrainedModel):
"""
- This class copied code from :class:`~transformers.RobertaModel` and overwrote standard self-attention with
+ This class copied code from [`RobertaModel`] and overwrote standard self-attention with
longformer self-attention to provide the ability to process long sequences following the self-attention approach
- described in `Longformer: the Long-Document Transformer `__ by Iz Beltagy,
+ described in [Longformer: the Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy,
Matthew E. Peters, and Arman Cohan. Longformer self-attention combines a local (sliding window) and global
attention to extend to long documents without the O(n^2) increase in memory and compute.
- The self-attention module :obj:`LongformerSelfAttention` implemented here supports the combination of local and
+ The self-attention module `LongformerSelfAttention` implemented here supports the combination of local and
global attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and
dilated attention are more relevant for autoregressive language modeling than finetuning on downstream tasks.
Future release will add support for autoregressive attention, but the support for dilated attention requires a
@@ -1740,32 +1717,32 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
- Examples::
+ Examples:
- >>> import torch
- >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
+ ```python
+ >>> import torch
+ >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
- >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
- >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+ >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
+ >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
- >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
- >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
+ >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
+ >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
- >>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
- ... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
- >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
- >>> loss = outputs.loss
- >>> prediction_logits = outputs.logits
- """
+ >>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
+ ... # check `LongformerModel.forward` for more details on how to set `attention_mask`
+ >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
+ >>> loss = outputs.loss
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.longformer(
@@ -1845,10 +1822,9 @@ class LongformerForSequenceClassification(LongformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1967,42 +1943,42 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
- >>> import torch
+ ```python
+ >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
+ >>> import torch
- >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
- >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+ >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+ >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
- >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
- >>> encoding = tokenizer(question, text, return_tensors="pt")
- >>> input_ids = encoding["input_ids"]
+ >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+ >>> encoding = tokenizer(question, text, return_tensors="pt")
+ >>> input_ids = encoding["input_ids"]
- >>> # default is local attention everywhere
- >>> # the forward method will automatically set global attention on question tokens
- >>> attention_mask = encoding["attention_mask"]
+ >>> # default is local attention everywhere
+ >>> # the forward method will automatically set global attention on question tokens
+ >>> attention_mask = encoding["attention_mask"]
- >>> outputs = model(input_ids, attention_mask=attention_mask)
- >>> start_logits = outputs.start_logits
- >>> end_logits = outputs.end_logits
- >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
+ >>> outputs = model(input_ids, attention_mask=attention_mask)
+ >>> start_logits = outputs.start_logits
+ >>> end_logits = outputs.end_logits
+ >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
- >>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
- >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
-
- """
+ >>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
+ >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if global_attention_mask is None:
@@ -2109,9 +2085,8 @@ class LongformerForTokenClassification(LongformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2202,10 +2177,9 @@ class LongformerForMultipleChoice(LongformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]`,
+ where `num_choices` is the size of the second dimension of the input tensors (see `input_ids` above).
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 38018aff27..19d354248b 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -66,31 +66,29 @@ class TFLongformerBaseModelOutput(ModelOutput):
Base class for Longformer's outputs, with potential hidden states, local and global attentions.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` are set to 0; the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -109,35 +107,33 @@ class TFLongformerBaseModelOutputWithPooling(ModelOutput):
Base class for Longformer's outputs that also contains a pooling of the last hidden states.
Args:
- last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` are set to 0; the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -157,33 +153,31 @@ class TFLongformerMaskedLMOutput(ModelOutput):
Base class for masked language models outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` are set to 0; the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -203,35 +197,33 @@ class TFLongformerQuestionAnsweringModelOutput(ModelOutput):
Base class for outputs of question answering Longformer models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
- end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` are set to 0; the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -252,33 +244,31 @@ class TFLongformerSequenceClassifierOutput(ModelOutput):
Base class for outputs of sentence classification models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` are set to 0; the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -298,35 +288,33 @@ class TFLongformerMultipleChoiceModelOutput(ModelOutput):
Base class for outputs of multiple choice models.
Args:
- loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
- `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
+ logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
+ `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` are set to 0; the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -346,33 +334,31 @@ class TFLongformerTokenClassifierOutput(ModelOutput):
Base class for outputs of token classification models.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
Classification loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x +
- attention_window + 1)`, where ``x`` is the number of tokens with global attention mask.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.
Local attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
- global attention (first ``x`` values) and to every token in the attention window (remaining
- ``attention_window + 1`` values). Note that the first ``x`` values refer to tokens with fixed positions in
- the text, but the remaining ``attention_window + 1`` values refer to tokens with relative positions: the
- attention weight of a token to itself is located at index ``x + attention_window / 2`` and the
- ``attention_window / 2`` preceding (succeeding) values are the attention weights to the ``attention_window
- / 2`` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
- attention weight at the corresponding index is set to 0; the value should be accessed from the first ``x``
+ global attention (first `x` values) and to every token in the attention window (remaining
+ `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
+ the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
+ attention weight of a token to itself is located at index `x + attention_window / 2` and the
+ `attention_window / 2` preceding (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with global attention, the
+ attention weight at the corresponding index is set to 0; the value should be accessed from the first `x`
attention weights. If a token has global attention, the attention weights to all other tokens in
- :obj:`attentions` is set to 0, the values should be accessed from :obj:`global_attentions`.
- global_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, x)`,
- where ``x`` is the number of tokens with global attention mask.
+ `attentions` is set to 0, the values should be accessed from `global_attentions`.
+ global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+ where `x` is the number of tokens with global attention mask.
Global attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
@@ -536,7 +522,7 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -713,14 +699,14 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
training=False,
):
"""
- LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`. Padding to
- `attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer.
+ LongformerSelfAttention expects *len(hidden_states)* to be a multiple of *attention_window*. Padding to
+ *attention_window* happens in LongformerModel.forward to avoid redoing the padding on each layer.
- The `attention_mask` is changed in :meth:`LongformerModel.forward` from 0, 1, 2 to:
+ The *attention_mask* is changed in [`LongformerModel.forward`] from 0, 1, 2 to:
- * -10000: no attention
- * 0: local attention
- * +10000: global attention
+ - -10000: no attention
+ - 0: local attention
+ - +10000: global attention
"""
# retrieve input args
(
@@ -1872,104 +1858,104 @@ class TFLongformerPreTrainedModel(TFPreTrainedModel):
LONGFORMER_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ </Tip>
+
Parameters:
- config (:class:`~transformers.LongformerConfig`): Model configuration class with all the parameters of the model.
+ config ([`LongformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LONGFORMER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LongformerTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`LongformerTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
+ global_attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attends to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the <s> token should be given global attention. For QA, all question tokens should also
- have global attention. Please refer to the `Longformer paper <https://arxiv.org/abs/2004.05150>`__ for more
- details. Mask values selected in ``[0, 1]``:
+ have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more
+ details. Mask values selected in `[0, 1]`:
- 0 for local attention (a sliding window attention),
- 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
- token_type_ids (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ token_type_ids (`tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are position IDs?](../glossary#position-ids)
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1982,13 +1968,13 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
class TFLongformerModel(TFLongformerPreTrainedModel):
"""
- This class copies code from :class:`~transformers.TFRobertaModel` and overwrites standard self-attention with
+ This class copies code from [`TFRobertaModel`] and overwrites standard self-attention with
longformer self-attention to provide the ability to process long sequences following the self-attention approach
- described in `Longformer: the Long-Document Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy,
+ described in [Longformer: the Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy,
Matthew E. Peters, and Arman Cohan. Longformer self-attention combines a local (sliding window) and global
attention to extend to long documents without the O(n^2) increase in memory and compute.
- The self-attention module :obj:`TFLongformerSelfAttention` implemented here supports the combination of local and
+ The self-attention module `TFLongformerSelfAttention` implemented here supports the combination of local and
global attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and
dilated attention are more relevant for autoregressive language modeling than finetuning on downstream tasks.
Future release will add support for autoregressive attention, but the support for dilated attention requires a
@@ -2109,10 +2095,9 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -2218,13 +2203,13 @@ class TFLongformerForQuestionAnswering(TFLongformerPreTrainedModel, TFQuestionAn
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
inputs = input_processing(
@@ -2512,10 +2497,9 @@ class TFLongformerForMultipleChoice(TFLongformerPreTrainedModel, TFMultipleChoic
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -2665,9 +2649,8 @@ class TFLongformerForTokenClassification(TFLongformerPreTrainedModel, TFTokenCla
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py
index c2922935ad..468093f24a 100644
--- a/src/transformers/models/luke/modeling_luke.py
+++ b/src/transformers/models/luke/modeling_luke.py
@@ -54,24 +54,23 @@ class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
Base class for outputs of the LUKE model.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- entity_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, entity_length, hidden_size)`):
+ entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
- pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- entity_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
+ entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
of each layer plus the initial entity embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length + entity_length, sequence_length + entity_length)`. Attentions weights after the attention
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length + entity_length, sequence_length + entity_length)`. Attentions weights after the attention
softmax, used to compute the weighted average in the self-attention heads.
"""
@@ -85,22 +84,21 @@ class BaseLukeModelOutput(BaseModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- entity_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, entity_length, hidden_size)`):
+ entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- entity_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
+ entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
of each layer plus the initial entity embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -116,28 +114,27 @@ class LukeMaskedLMOutput(ModelOutput):
Base class for model's outputs, with potential hidden states and attentions.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
The sum of masked language modeling (MLM) loss and entity prediction loss.
- mlm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
- mep_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked entity prediction (MEP) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- entity_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- entity_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
+ entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
of each layer plus the initial entity embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -159,21 +156,20 @@ class EntityClassificationOutput(ModelOutput):
Outputs of entity classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- entity_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
+ entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
of each layer plus the initial entity embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -190,21 +186,20 @@ class EntityPairClassificationOutput(ModelOutput):
Outputs of entity pair classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- entity_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
+ entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
of each layer plus the initial entity embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -221,21 +216,20 @@ class EntitySpanClassificationOutput(ModelOutput):
Outputs of entity span classification models.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- entity_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
+ entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output
of each layer plus the initial entity embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -783,95 +777,92 @@ class LukePreTrainedModel(PreTrainedModel):
LUKE_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.LukeConfig`): Model configuration class with all the parameters of the
+ config ([`LukeConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LUKE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LukeTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LukeTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
+ [What are position IDs?](../glossary#position-ids)
- entity_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, entity_length)`):
+ entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
Indices of entity tokens in the entity vocabulary.
- Indices can be obtained using :class:`~transformers.LukeTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LukeTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- entity_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, entity_length)`, `optional`):
- Mask to avoid performing attention on padding entity token indices. Mask values selected in ``[0, 1]``:
+ entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
+ Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:
- 1 for entity tokens that are **not masked**,
- 0 for entity tokens that are **masked**.
- entity_token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, entity_length)`, `optional`):
+ entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
Segment token indices to indicate first and second portions of the entity token inputs. Indices are
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- - 0 corresponds to a `portion A` entity token,
- - 1 corresponds to a `portion B` entity token.
+ - 0 corresponds to a *portion A* entity token,
+ - 1 corresponds to a *portion B* entity token.
- entity_position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, entity_length, max_mention_length)`, `optional`):
- Indices of positions of each input entity in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
+ Indices of positions of each input entity in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1048,13 +1039,13 @@ class LukeModel(LukePreTrainedModel):
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
- word_attention_mask (:obj:`torch.LongTensor`):
+ word_attention_mask (`torch.LongTensor`):
Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
- entity_attention_mask (:obj:`torch.LongTensor`, `optional`):
+ entity_attention_mask (`torch.LongTensor`, *optional*):
Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
Returns:
- :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
+ `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
attention_mask = word_attention_mask
if entity_attention_mask is not None:
@@ -1180,14 +1171,12 @@ class LukeForMaskedLM(LukePreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- entity_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, entity_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
@@ -1292,31 +1281,32 @@ class LukeForEntityClassification(LukePreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)` or :obj:`(batch_size, num_labels)`, `optional`):
- Labels for computing the classification loss. If the shape is :obj:`(batch_size,)`, the cross entropy loss
+ labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
+ Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss
is used for the single-label classification. In this case, labels should contain the indices that should be
- in :obj:`[0, ..., config.num_labels - 1]`. If the shape is :obj:`(batch_size, num_labels)`, the binary
+ in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary
cross entropy loss is used for the multi-label classification. In this case, labels should only contain
- ``[0, 1]``, where 0 and 1 indicate false and true, respectively.
+ `[0, 1]`, where 0 and 1 indicate false and true, respectively.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LukeTokenizer, LukeForEntityClassification
+ ```python
+ >>> from transformers import LukeTokenizer, LukeForEntityClassification
- >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
- >>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
+ >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
+ >>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
- >>> text = "Beyoncé lives in Los Angeles."
- >>> entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé"
- >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- Predicted class: person
- """
+ >>> text = "Beyoncé lives in Los Angeles."
+ >>> entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé"
+ >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ Predicted class: person
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.luke(
@@ -1406,31 +1396,32 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)` or :obj:`(batch_size, num_labels)`, `optional`):
- Labels for computing the classification loss. If the shape is :obj:`(batch_size,)`, the cross entropy loss
+ labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
+ Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss
is used for the single-label classification. In this case, labels should contain the indices that should be
- in :obj:`[0, ..., config.num_labels - 1]`. If the shape is :obj:`(batch_size, num_labels)`, the binary
+ in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary
cross entropy loss is used for the multi-label classification. In this case, labels should only contain
- ``[0, 1]``, where 0 and 1 indicate false and true, respectively.
+ `[0, 1]`, where 0 and 1 indicate false and true, respectively.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LukeTokenizer, LukeForEntityPairClassification
+ ```python
+ >>> from transformers import LukeTokenizer, LukeForEntityPairClassification
- >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
- >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
+ >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
+ >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
- >>> text = "Beyoncé lives in Los Angeles."
- >>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
- >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- Predicted class: per:cities_of_residence
- """
+ >>> text = "Beyoncé lives in Los Angeles."
+ >>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+ >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ Predicted class: per:cities_of_residence
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.luke(
@@ -1524,48 +1515,48 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
return_dict=None,
):
r"""
- entity_start_positions (:obj:`torch.LongTensor`):
+ entity_start_positions (`torch.LongTensor`):
The start positions of entities in the word token sequence.
- entity_end_positions (:obj:`torch.LongTensor`):
+ entity_end_positions (`torch.LongTensor`):
The end positions of entities in the word token sequence.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, entity_length)` or :obj:`(batch_size, entity_length, num_labels)`, `optional`):
- Labels for computing the classification loss. If the shape is :obj:`(batch_size, entity_length)`, the cross
+ labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
+ Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
entropy loss is used for the single-label classification. In this case, labels should contain the indices
- that should be in :obj:`[0, ..., config.num_labels - 1]`. If the shape is :obj:`(batch_size, entity_length,
- num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
- labels should only contain ``[0, 1]``, where 0 and 1 indicate false and true, respectively.
+ that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length, num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
+ labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.
Returns:
- Examples::
+ Examples:
- >>> from transformers import LukeTokenizer, LukeForEntitySpanClassification
+ ```python
+ >>> from transformers import LukeTokenizer, LukeForEntitySpanClassification
- >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
- >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
+ >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
+ >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
- >>> text = "Beyoncé lives in Los Angeles"
+ >>> text = "Beyoncé lives in Los Angeles"
- # List all possible entity spans in the text
- >>> word_start_positions = [0, 8, 14, 17, 21] # character-based start positions of word tokens
- >>> word_end_positions = [7, 13, 16, 20, 28] # character-based end positions of word tokens
- >>> entity_spans = []
- >>> for i, start_pos in enumerate(word_start_positions):
- ... for end_pos in word_end_positions[i:]:
- ... entity_spans.append((start_pos, end_pos))
+ >>> # List all possible entity spans in the text
+ >>> word_start_positions = [0, 8, 14, 17, 21] # character-based start positions of word tokens
+ >>> word_end_positions = [7, 13, 16, 20, 28] # character-based end positions of word tokens
+ >>> entity_spans = []
+ >>> for i, start_pos in enumerate(word_start_positions):
+ ... for end_pos in word_end_positions[i:]:
+ ... entity_spans.append((start_pos, end_pos))
- >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
- >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
- ... if predicted_class_idx != 0:
- ... print(text[span[0]:span[1]], model.config.id2label[predicted_class_idx])
- Beyoncé PER
- Los Angeles LOC
- """
+ >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
+ >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
+ ... if predicted_class_idx != 0:
+ ... print(text[span[0]:span[1]], model.config.id2label[predicted_class_idx])
+ Beyoncé PER
+ Los Angeles LOC
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.luke(
diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py
index 1a54353d8b..cb9cdad9d2 100644
--- a/src/transformers/models/lxmert/modeling_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_lxmert.py
@@ -66,30 +66,27 @@ class LxmertModelOutput(ModelOutput):
Args:
- language_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the language encoder.
- vision_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the visual encoder.
- pooled_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
by a Linear layer and a Tanh activation function. The Linear
- language_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- vision_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- language_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- vision_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -106,31 +103,28 @@ class LxmertModelOutput(ModelOutput):
@dataclass
class LxmertForQuestionAnsweringOutput(ModelOutput):
"""
- Output type of :class:`~transformers.LxmertForQuestionAnswering`.
+ Output type of [`LxmertForQuestionAnswering`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.k.
- question_answering_score: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, n_qa_answers)`, `optional`):
+ question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of question answering objective (classification).
- language_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- vision_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- language_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- vision_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -146,36 +140,33 @@ class LxmertForQuestionAnsweringOutput(ModelOutput):
@dataclass
class LxmertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.LxmertForPreTraining`.
+ Output type of [`LxmertForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- cross_relationship_score: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ cross_relationship_score: (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
- question_answering_score: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, n_qa_answers)`):
+ question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
- language_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- vision_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- language_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- vision_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -801,83 +792,81 @@ class LxmertPreTrainedModel(PreTrainedModel):
LXMERT_START_DOCSTRING = r"""
- The LXMERT model was proposed in `LXMERT: Learning Cross-Modality Encoder Representations from Transformers
- `__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
+ The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pretrained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
question answering attribute prediction, and object tag prediction.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.LxmertConfig`): Model configuration class with all the parameters of the model.
+ config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LXMERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LxmertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`LxmertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- visual_feats: (:obj:`torch.FloatTensor` of shape :obj:՝(batch_size, num_visual_features, visual_feat_dim)՝):
+ [What are input IDs?](../glossary#input-ids)
+ visual_feats: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents visual features. They ROI pooled object features from bounding boxes using a
faster-RCNN model)
These are currently not provided by the transformers library.
- visual_pos: (:obj:`torch.FloatTensor` of shape :obj:՝(batch_size, num_visual_features, visual_pos_dim)՝):
+ visual_pos: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
This input represents spacial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
1.
These are currently not provided by the transformers library.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- visual_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ visual_attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1086,14 +1075,14 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
will add newly initialized weights. Reducing the size will remove weights from the end
Args:
- num_labels (:obj:`int`, `optional`):
+ num_labels (`int`, *optional*):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
- weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
- just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
+ weights at the end. Reducing the size will remove weights from the end. If not provided or `None`,
+ just returns a pointer to the qa labels `torch.nn.Linear` module of the model without doing
anything.
Return:
- :obj:`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
+ `torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()
@@ -1116,7 +1105,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
Returns the the linear layer that produces question answering logits.
Returns:
- :obj:`nn.Module`: A torch module mapping the question answering prediction hidden states or :obj:`None` if
+ `nn.Module`: A torch module mapping the question answering prediction hidden states or `None` if
LXMERT does not have a visual answering head.
"""
if hasattr(self, "answer_head"):
@@ -1174,22 +1163,21 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
**kwargs,
):
r"""
- labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- obj_labels: (``Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]``, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ obj_labels: (`Dict[str, Tuple[torch.FloatTensor, torch.FloatTensor]]`, *optional*):
each key is named after each one of the visual losses and each element of the tuple is of the shape
- ``(batch_size, num_features)`` and ``(batch_size, num_features, visual_feature_dim)`` for each the label id
+ `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id
and the label score respectively
- matched_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ matched_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the whether or not the text input matches the image (classification) loss. Input
- should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
- 0 indicates that the sentence does not match the image,
- 1 indicates that the sentence does match the image.
- ans: (``Torch.Tensor`` of shape ``(batch_size)``, `optional`):
- a one hot representation hof the correct answer `optional`
+ ans: (`torch.Tensor` of shape `(batch_size)`, *optional*):
+ a one hot representation of the correct answer *optional*
Returns:
"""
@@ -1317,14 +1305,14 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
will add newly initialized weights. Reducing the size will remove weights from the end
Args:
- num_labels (:obj:`int`, `optional`):
+ num_labels (`int`, *optional*):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
- weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
- just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
+ weights at the end. Reducing the size will remove weights from the end. If not provided or `None`,
+ just returns a pointer to the qa labels `torch.nn.Linear` module of the model without doing
anything.
Return:
- :obj:`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
+ `torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()
@@ -1347,7 +1335,7 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
Returns the the linear layer that produces question answering logits
Returns:
- :obj:`nn.Module`: A torch module mapping the question answering prediction hidden states. :obj:`None`: A
+ `nn.Module`: A torch module mapping the question answering prediction hidden states. `None`: A
NoneType object if Lxmert does not have the visual answering head.
"""
@@ -1407,7 +1395,7 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
return_dict=None,
):
r"""
- labels: (``Torch.Tensor`` of shape ``(batch_size)``, `optional`):
+ labels: (`torch.Tensor` of shape `(batch_size)`, *optional*):
A one-hot representation of the correct answer
Returns:
diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py
index b33c86ee21..1c8fd675de 100644
--- a/src/transformers/models/lxmert/modeling_tf_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py
@@ -55,30 +55,27 @@ class TFLxmertModelOutput(ModelOutput):
Args:
- language_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ language_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the language encoder.
- vision_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ vision_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the visual encoder.
- pooled_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
+ pooled_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
by a Linear layer and a Tanh activation function. The Linear
- language_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
- vision_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
- language_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+ vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+ language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
- vision_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
- cross_encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
@@ -95,36 +92,33 @@ class TFLxmertModelOutput(ModelOutput):
@dataclass
class TFLxmertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.LxmertForPreTraining`.
+ Output type of [`TFLxmertForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``tf.Tensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- cross_relationship_score: (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+ cross_relationship_score: (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
- question_answering_score: (:obj:`tf.Tensor` of shape :obj:`(batch_size, n_qa_answers)`):
+ question_answering_score: (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
- language_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
- vision_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
- language_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+ vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+ language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
- vision_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
- cross_encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
@@ -220,7 +214,7 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -837,101 +831,101 @@ class TFLxmertPreTrainedModel(TFPreTrainedModel):
LXMERT_START_DOCSTRING = r"""
- The LXMERT model was proposed in `LXMERT: Learning Cross-Modality Encoder Representations from Transformers
- `__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
+ The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
question answering attribute prediction, and object tag prediction.
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accepts two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.LxmertConfig`): Model configuration class with all the parameters of the model.
+ config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
LXMERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.LxmertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`LxmertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- visual_feats: (:obj:`tf.Tensor` of shape :obj:՝(batch_size, num_visual_features, visual_feat_dim)՝):
+ [What are input IDs?](../glossary#input-ids)
+ visual_feats: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents visual features. They ROI pooled object features from bounding boxes using a
faster-RCNN model)
These are currently not provided by the transformers library.
- visual_pos: (:obj:`tf.Tensor` of shape :obj:՝(batch_size, num_visual_features, visual_feat_dim)՝):
+ visual_pos: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents spacial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
1.
These are currently not provided by the transformers library.
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- visual_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- MMask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ visual_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1320,22 +1314,21 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
**kwargs,
):
r"""
- masked_lm_labels (``tf.Tensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- obj_labels: (``Dict[Str: Tuple[tf.Tensor, tf.Tensor]]``, `optional`, defaults to :obj: `None`):
+ masked_lm_labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ obj_labels: (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
each key is named after each one of the visual losses and each element of the tuple is of the shape
- ``(batch_size, num_features)`` and ``(batch_size, num_features, visual_feature_dim)`` for each the label id
+ `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id
and the label score respectively
- matched_label (``tf.Tensor`` of shape ``(batch_size,)``, `optional`):
+ matched_label (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the whether or not the text input matches the image (classification) loss. Input
- should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
- 0 indicates that the sentence does not match the image,
- 1 indicates that the sentence does match the image.
- ans: (``Torch.Tensor`` of shape ``(batch_size)``, `optional`, defaults to :obj: `None`):
- a one hot representation hof the correct answer `optional`
+ ans: (`tf.Tensor` of shape `(batch_size)`, *optional*, defaults to `None`):
+ a one hot representation of the correct answer *optional*
Returns:
"""
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
index 4c9caadd8c..21d86aa2e0 100755
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -366,13 +366,13 @@ class M2M100EncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -450,19 +450,19 @@ class M2M100DecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size *(decoder_attention_heads,)*.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -547,19 +547,19 @@ class M2M100PreTrainedModel(PreTrainedModel):
M2M_100_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.M2M100Config`):
+ config ([`M2M100Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
M2M_100_GENERATION_EXAMPLE = r"""
@@ -580,101 +580,97 @@ M2M_100_GENERATION_EXAMPLE = r"""
M2M_100_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.M2M100Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`M2M100Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.M2M100Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`M2M100Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- M2M100 uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ M2M100 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class M2M100Encoder(M2M100PreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`M2M100EncoderLayer`.
+ [`M2M100EncoderLayer`].
Args:
config: M2M100Config
@@ -721,40 +717,40 @@ class M2M100Encoder(M2M100PreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.M2M100Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`M2M100Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -843,7 +839,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
class M2M100Decoder(M2M100PreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`M2M100DecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`M2M100DecoderLayer`]
Args:
config: M2M100Config
@@ -892,71 +888,68 @@ class M2M100Decoder(M2M100PreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.M2M100Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`M2M100Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1277,27 +1270,27 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
- Example::
+ Example:
- >>> from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
+ ```python
+ >>> from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
- >>> model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
- >>> tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M')
+ >>> model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
+ >>> tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M')
- >>> text_to_translate = "Life is like a box of chocolates"
- >>> model_inputs = tokenizer(text_to_translate, return_tensors='pt')
+ >>> text_to_translate = "Life is like a box of chocolates"
+ >>> model_inputs = tokenizer(text_to_translate, return_tensors='pt')
- >>> # translate to French
- >>> gen_tokens = model.generate( **model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
- >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
- """
+ >>> # translate to French
+ >>> gen_tokens = model.generate( **model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
+ >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py
index dfe2d913d8..5d58f03877 100644
--- a/src/transformers/models/marian/modeling_flax_marian.py
+++ b/src/transformers/models/marian/modeling_flax_marian.py
@@ -51,170 +51,165 @@ _TOKENIZER_FOR_DOC = "MarianTokenizer"
MARIAN_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.MarianConfig`): Model configuration class with all the parameters of the model.
+ config ([`MarianConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
MARIAN_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
MARIAN_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
MARIAN_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*, is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -931,15 +926,14 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index 9827f4caee..19903612ce 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -312,13 +312,13 @@ class MarianEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -396,19 +396,19 @@ class MarianDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -506,19 +506,19 @@ class MarianPreTrainedModel(PreTrainedModel):
MARIAN_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.MarianConfig`):
+ config ([`MarianConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MARIAN_GENERATION_EXAMPLE = r"""
@@ -544,102 +544,98 @@ MARIAN_GENERATION_EXAMPLE = r"""
MARIAN_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Marian uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ Marian uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*, is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class MarianEncoder(MarianPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`MarianEncoderLayer`.
+ [`MarianEncoderLayer`].
Args:
config: MarianConfig
@@ -685,40 +681,40 @@ class MarianEncoder(MarianPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -805,7 +801,7 @@ class MarianEncoder(MarianPreTrainedModel):
class MarianDecoder(MarianPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`MarianDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MarianDecoderLayer`].
Args:
config: MarianConfig
@@ -878,71 +874,68 @@ class MarianDecoder(MarianPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1277,10 +1270,9 @@ class MarianMTModel(MarianPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
@@ -1383,7 +1375,7 @@ class MarianMTModel(MarianPreTrainedModel):
class MarianDecoderWrapper(MarianPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -1445,88 +1437,87 @@ class MarianForCausalLM(MarianPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import MarianTokenizer, MarianForCausalLM
+ ```python
+ >>> from transformers import MarianTokenizer, MarianForCausalLM
- >>> tokenizer = MarianTokenizer.from_pretrained('facebook/bart-large')
- >>> model = MarianForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-fr-en')
+ >>> model = MarianForCausalLM.from_pretrained('Helsinki-NLP/opus-mt-fr-en', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py
index 2412fca17c..d4bb501838 100644
--- a/src/transformers/models/marian/modeling_tf_marian.py
+++ b/src/transformers/models/marian/modeling_tf_marian.py
@@ -339,11 +339,11 @@ class TFMarianEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*
"""
residual = hidden_states
hidden_states, self_attn_weights, _ = self.self_attn(
@@ -416,17 +416,17 @@ class TFMarianDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`
- cross_attn_layer_head_mask (:obj:`tf.Tensor`): mask for heads of the cross-attention module.
- `(decoder_attention_heads,)`
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(decoder_attention_heads,)*
+ cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
+ *(decoder_attention_heads,)*
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
@@ -517,37 +517,39 @@ class TFMarianPreTrainedModel(TFPreTrainedModel):
MARIAN_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Args:
- config (:class:`~transformers.MarianConfig`): Model configuration class with all the parameters of the model.
+ config ([`MarianConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
@@ -574,76 +576,76 @@ MARIAN_GENERATION_EXAMPLE = r"""
MARIAN_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Marian uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ Marian uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -654,7 +656,7 @@ class TFMarianEncoder(tf.keras.layers.Layer):
config_class = MarianConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFMarianEncoderLayer`.
+ [`TFMarianEncoderLayer`].
Args:
config: MarianConfig
@@ -697,44 +699,43 @@ class TFMarianEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -821,7 +822,7 @@ class TFMarianEncoder(tf.keras.layers.Layer):
class TFMarianDecoder(tf.keras.layers.Layer):
config_class = MarianConfig
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFMarianDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFMarianDecoderLayer`]
Args:
config: MarianConfig
@@ -869,69 +870,66 @@ class TFMarianDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MarianTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MarianTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1375,10 +1373,9 @@ class TFMarianMTModel(TFMarianPreTrainedModel, TFCausalLanguageModelingLoss):
**kwargs,
):
r"""
- labels (:obj:`tf.tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py
index 4009a2296f..88d8b76b69 100644
--- a/src/transformers/models/mbart/modeling_flax_mbart.py
+++ b/src/transformers/models/mbart/modeling_flax_mbart.py
@@ -59,170 +59,165 @@ _TOKENIZER_FOR_DOC = "MBartTokenizer"
MBART_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.MBartConfig`): Model configuration class with all the parameters of the model.
+ config ([`MBartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
MBART_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
MBART_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
MBART_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`
+ (*optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -992,15 +987,14 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index 3cd5ba3ddc..0892b62b58 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -300,13 +300,13 @@ class MBartEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -383,19 +383,19 @@ class MBartDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size *(decoder_attention_heads,)*.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -515,19 +515,19 @@ class MBartPreTrainedModel(PreTrainedModel):
MBART_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.MBartConfig`):
+ config ([`MBartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MBART_GENERATION_EXAMPLE = r"""
@@ -565,107 +565,103 @@ MBART_GENERATION_EXAMPLE = r"""
MBART_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- MBart uses a specific language id token as the starting token for :obj:`decoder_input_ids` generation that
- varies according to source and target language, *e.g.* 25004 for `en_XX`, and 25003 for `de_DE`. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
+ varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`
+ (*optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class MBartEncoder(MBartPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`MBartEncoderLayer`.
+ [`MBartEncoderLayer`].
Args:
config: MBartConfig
@@ -717,40 +713,40 @@ class MBartEncoder(MBartPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -840,7 +836,7 @@ class MBartEncoder(MBartPreTrainedModel):
class MBartDecoder(MBartPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`MBartDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`].
Args:
config: MBartConfig
@@ -914,71 +910,68 @@ class MBartDecoder(MBartPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size,
+ sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1303,10 +1296,9 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
@@ -1446,9 +1438,8 @@ class MBartForSequenceClassification(MBartPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
@@ -1572,13 +1563,13 @@ class MBartForQuestionAnswering(MBartPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1651,7 +1642,7 @@ class MBartForQuestionAnswering(MBartPreTrainedModel):
class MBartDecoderWrapper(MBartPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -1713,88 +1704,87 @@ class MBartForCausalLM(MBartPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import MBartTokenizer, MBartForCausalLM
+ ```python
+ >>> from transformers import MBartTokenizer, MBartForCausalLM
- >>> tokenizer = MBartTokenizer.from_pretrained('facebook/bart-large')
- >>> model = MBartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+ >>> model = MBartForCausalLM.from_pretrained('facebook/mbart-large-cc25', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py
index 06e13cb29f..0702b8d178 100644
--- a/src/transformers/models/mbart/modeling_tf_mbart.py
+++ b/src/transformers/models/mbart/modeling_tf_mbart.py
@@ -298,11 +298,11 @@ class TFMBartEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(encoder_attention_heads,)*
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -374,17 +374,17 @@ class TFMBartDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`
- cross_attn_layer_head_mask (:obj:`tf.Tensor`): mask for heads of the cross-attention module.
- `(decoder_attention_heads,)`
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+ attention_mask (`tf.Tensor`): attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ *(decoder_attention_heads,)*
+ cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
+ *(decoder_attention_heads,)*
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -475,117 +475,119 @@ class TFMBartPreTrainedModel(TFPreTrainedModel):
MBART_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accepts two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ </Tip>
Args:
- config (:class:`~transformers.MBartConfig`): Model configuration class with all the parameters of the model.
+ config ([`MBartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
MBART_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- MBart uses a specific language id token as the starting token for :obj:`decoder_input_ids` generation that
- varies according to source and target language, *e.g.* 25004 for `en_XX`, and 25003 for `de_DE`. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
+ varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
- :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
the right for denoising pre-training following the paper.
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -625,7 +627,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
config_class = MBartConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFMBartEncoderLayer`.
+ [`TFMBartEncoderLayer`].
Args:
config: MBartConfig
@@ -670,44 +672,43 @@ class TFMBartEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -797,7 +798,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
class TFMBartDecoder(tf.keras.layers.Layer):
config_class = MBartConfig
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFMBartDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFMBartDecoderLayer`].
Args:
config: MBartConfig
@@ -847,69 +848,66 @@ class TFMBartDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.MBartTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`MBartTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
+ `(batch_size, sequence_length)`.
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1359,10 +1357,9 @@ class TFMBartForConditionalGeneration(TFMBartPreTrainedModel, TFCausalLanguageMo
**kwargs,
):
"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
index f6b3caaf59..186541e3e6 100755
--- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
@@ -731,25 +731,25 @@ class MegatronBertPreTrainedModel(PreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->MegatronBert
class MegatronBertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.MegatronBertForPreTraining`.
+ Output type of [`MegatronBertForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -764,69 +764,67 @@ class MegatronBertForPreTrainingOutput(ModelOutput):
MEGATRON_BERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.MegatronBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`MegatronBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
MEGATRON_BERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -838,13 +836,13 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
"""
@@ -898,24 +896,24 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1045,35 +1043,35 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, MegatronBertForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, MegatronBertForPreTraining
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
- >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-cased-345m')
+ >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
+ >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-cased-345m')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
- """
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
@@ -1158,44 +1156,45 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, MegatronBertForCausalLM, MegatronBertConfig
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, MegatronBertForCausalLM, MegatronBertConfig
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
- >>> model = MegatronBertForCausalLM.from_pretrained('nvidia/megatron-bert-cased-345m', is_decoder=True)
+ >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
+ >>> model = MegatronBertForCausalLM.from_pretrained('nvidia/megatron-bert-cased-345m', is_decoder=True)
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
@@ -1309,10 +1308,9 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1399,31 +1397,32 @@ class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
**kwargs
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, MegatronBertForNextSentencePrediction
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, MegatronBertForNextSentencePrediction
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
- >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-cased-345m')
+ >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
+ >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-cased-345m')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
- >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
- >>> logits = outputs.logits
- >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
- """
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```"""
if "next_sentence_label" in kwargs:
warnings.warn(
@@ -1507,10 +1506,9 @@ class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1606,10 +1604,9 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1702,9 +1699,8 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1794,13 +1790,13 @@ class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py
index 4abce49074..d9b76c6f6b 100644
--- a/src/transformers/models/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/mmbt/modeling_mmbt.py
@@ -77,102 +77,99 @@ class ModalEmbeddings(nn.Module):
MMBT_START_DOCSTRING = r"""
- MMBT model was proposed in `Supervised Multimodal Bitransformers for Classifying Images and Text
- `__ by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
+ MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and
obtain state-of-the-art performance on various multimodal classification benchmark tasks.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.MMBTConfig`): Model configuration class with all the parameters of the model.
+ config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration.
- transformer (:class: `~nn.Module`): A text transformer that is used by MMBT.
+ transformer (`nn.Module`): A text transformer that is used by MMBT.
It should have embeddings, encoder, and pooler attributes.
- encoder (:class: `~nn.Module`): Encoder for the second modality.
+ encoder (`nn.Module`): Encoder for the second modality.
It should take in a batch of modal inputs and return k, n dimension embeddings.
"""
MMBT_INPUTS_DOCSTRING = r"""
Args:
- input_modal (``torch.FloatTensor`` of shape ``(batch_size, ***)``):
+ input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`):
The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image
Encoder, the shape would be (batch_size, channels, height, width)
- input_ids (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's
appended to the end of other modality embeddings. Indices can be obtained using
- :class:`~transformers.BertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and
- :meth:`transformers.PreTrainedTokenizer.__call__` for details.
+ [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- modal_start_tokens (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ [What are input IDs?](../glossary#input-ids)
+ modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for classification
tasks.
- modal_end_tokens (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used.
- attention_mask (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- modal_token_type_ids (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ modal_token_type_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
Segment token indices to indicate different portions of the non-text modality. The embeddings from these
tokens will be summed with the respective token embeddings for the non-text modality.
- position_ids (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- modal_position_ids (``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``, `optional`):
+ [What are position IDs?](../glossary#position-ids)
+ modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
- Selected in the range ``[0, config.max_position_embeddings - 1]``.
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- encoder_hidden_states (``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -313,31 +310,30 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
)
class MMBTForClassification(nn.Module):
r"""
- **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
- Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
- config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
- If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+ **labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
- Returns: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**:
- (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or
- regression if config.num_labels==1) loss. **logits**: ``torch.FloatTensor`` of shape ``(batch_size,
- config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``output_hidden_states=True``) list of ``torch.FloatTensor`` (one for
- the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``:
+ Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**:
+ (*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or
+ regression if config.num_labels==1) loss. **logits**: `torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ **hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for
+ the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`:
Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**:
- (`optional`, returned when ``output_attentions=True``) list of ``torch.FloatTensor`` (one for each layer) of shape
- ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used
+ (*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape
+ `(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used
to compute the weighted average in the self-attention heads.
- Examples::
+ Examples:
- # For example purposes. Not runnable.
- transformer = BertModel.from_pretrained('bert-base-uncased')
- encoder = ImageEncoder(args)
- model = MMBTForClassification(config, transformer, encoder)
- outputs = model(input_modal, input_ids, labels=labels)
- loss, logits = outputs[:2]
- """
+ ```python
+ # For example purposes. Not runnable.
+ transformer = BertModel.from_pretrained('bert-base-uncased')
+ encoder = ImageEncoder(args)
+ model = MMBTForClassification(config, transformer, encoder)
+ outputs = model(input_modal, input_ids, labels=labels)
+ loss, logits = outputs[:2]
+ ```"""
def __init__(self, config, transformer, encoder):
super().__init__()
diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py
index 79519f6cf8..38738daeb9 100644
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -683,25 +683,24 @@ class MobileBertPreTrainedModel(PreTrainedModel):
@dataclass
class MobileBertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.MobileBertForPreTraining`.
+ Output type of [`MobileBertForPreTraining`].
Args:
- loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -716,69 +715,67 @@ class MobileBertForPreTrainingOutput(ModelOutput):
MOBILEBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.MobileBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`MobileBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
MOBILEBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -942,34 +939,33 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
- Examples::
+ Examples:
- >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
+ >>> import torch
- >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
- >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
+ >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+ >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
- >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
- >>> outputs = model(input_ids)
+ >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
+ >>> outputs = model(input_ids)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
-
- """
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
@@ -1054,10 +1050,9 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1134,31 +1129,32 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see ``input_ids`` docstring) Indices should be in ``[0, 1]``.
+ (see `input_ids` docstring). Indices should be in `[0, 1]`.
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
- Examples::
+ Examples:
- >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
- >>> import torch
+ ```python
+ >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
+ >>> import torch
- >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
- >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+ >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+ >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
- >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
- >>> loss = outputs.loss
- >>> logits = outputs.logits
- """
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
if "next_sentence_label" in kwargs:
warnings.warn(
@@ -1246,10 +1242,10 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1348,14 +1344,14 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
- sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
- sequence are not taken into account for computing the loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+ are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1453,10 +1449,10 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1553,9 +1549,8 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
index 3d81ced6a1..6de104a941 100644
--- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
@@ -159,7 +159,7 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -799,22 +799,21 @@ class TFMobileBertPreTrainedModel(TFPreTrainedModel):
@dataclass
class TFMobileBertForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.TFMobileBertForPreTraining`.
+ Output type of [`TFMobileBertForPreTraining`].
Args:
- prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+ seq_relationship_logits (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -829,92 +828,92 @@ class TFMobileBertForPreTrainingOutput(ModelOutput):
MOBILEBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.MobileBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`MobileBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
MOBILEBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MobileBertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`MobileBertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1143,9 +1142,8 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels
"""
inputs = input_processing(
@@ -1367,10 +1365,9 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1476,13 +1473,13 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
@@ -1610,10 +1607,9 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1761,9 +1757,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py
index c4eadbf439..59ce04003e 100644
--- a/src/transformers/models/mpnet/modeling_mpnet.py
+++ b/src/transformers/models/mpnet/modeling_mpnet.py
@@ -419,61 +419,60 @@ class MPNetPooler(nn.Module):
MPNET_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.MPNetConfig`): Model configuration class with all the parameters of the model.
+ config ([`MPNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
MPNET_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.MPNetTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`MPNetTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -613,10 +612,9 @@ class MPNetForMaskedLM(MPNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -716,10 +714,9 @@ class MPNetForSequenceClassification(MPNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -811,10 +808,9 @@ class MPNetForMultipleChoice(MPNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -904,9 +900,8 @@ class MPNetForTokenClassification(MPNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1014,13 +1009,13 @@ class MPNetForQuestionAnswering(MPNetPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py
index b043fc5705..c2322d13d5 100644
--- a/src/transformers/models/mpnet/modeling_tf_mpnet.py
+++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py
@@ -137,7 +137,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -588,84 +588,85 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
MPNET_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensor in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "attention_mask": attention_mask})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "attention_mask": attention_mask})`
+
+
Args:
- config (:class:`~transformers.MPNetConfig`): Model configuration class with all the parameters of the model.
+ config ([`MPNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
MPNET_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.MPNetTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`MPNetTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -831,10 +832,9 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
@@ -952,10 +952,9 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
@@ -1058,10 +1057,9 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1190,9 +1188,8 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
@@ -1290,13 +1287,13 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
diff --git a/src/transformers/models/mt5/modeling_flax_mt5.py b/src/transformers/models/mt5/modeling_flax_mt5.py
index 4d2437e8c0..43abc1794c 100644
--- a/src/transformers/models/mt5/modeling_flax_mt5.py
+++ b/src/transformers/models/mt5/modeling_flax_mt5.py
@@ -27,52 +27,54 @@ _TOKENIZER_FOR_DOC = "T5Tokenizer"
class FlaxMT5Model(FlaxT5Model):
r"""
- This class overrides :class:`~transformers.FlaxT5Model`. Please check the superclass for the appropriate
+ This class overrides [`FlaxT5Model`]. Please check the superclass for the appropriate
documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import FlaxMT5Model, T5Tokenizer
+ ```python
+ >>> from transformers import FlaxMT5Model, T5Tokenizer
- >>> model = FlaxMT5Model.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> model = FlaxMT5Model.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> summary = "Weiter Verhandlung in Syrien."
- >>> inputs = tokenizer(article, return_tensors="np")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> summary = "Weiter Verhandlung in Syrien."
+ >>> inputs = tokenizer(article, return_tensors="np")
- >>> with tokenizer.as_target_tokenizer():
- ... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
+ >>> with tokenizer.as_target_tokenizer():
+ ... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
- >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
- >>> hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
+ >>> hidden_states = outputs.last_hidden_state
+ ```"""
model_type = "mt5"
config_class = MT5Config
class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration):
r"""
- This class overrides :class:`~transformers.FlaxT5ForConditionalGeneration`. Please check the superclass for the
+ This class overrides [`FlaxT5ForConditionalGeneration`]. Please check the superclass for the
appropriate documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import FlaxMT5ForConditionalGeneration, T5Tokenizer
+ ```python
+ >>> from transformers import FlaxMT5ForConditionalGeneration, T5Tokenizer
- >>> model = FlaxMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> model = FlaxMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> summary = "Weiter Verhandlung in Syrien."
- >>> inputs = tokenizer(article, return_tensors="np")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> summary = "Weiter Verhandlung in Syrien."
+ >>> inputs = tokenizer(article, return_tensors="np")
- >>> with tokenizer.as_target_tokenizer():
- ... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
+ >>> with tokenizer.as_target_tokenizer():
+ ... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
- >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
- >>> logits = outputs.logits
- """
+ >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
+ >>> logits = outputs.logits
+ ```"""
model_type = "mt5"
config_class = MT5Config
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index 8276dd472b..87ca5d2b83 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -27,23 +27,24 @@ _TOKENIZER_FOR_DOC = "T5Tokenizer"
class MT5Model(T5Model):
r"""
- This class overrides :class:`~transformers.T5Model`. Please check the superclass for the appropriate documentation
+ This class overrides [`T5Model`]. Please check the superclass for the appropriate documentation
alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import MT5Model, T5Tokenizer
- >>> model = MT5Model.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> summary = "Weiter Verhandlung in Syrien."
- >>> inputs = tokenizer(article, return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(summary, return_tensors="pt")
+ ```python
+ >>> from transformers import MT5Model, T5Tokenizer
+ >>> model = MT5Model.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> summary = "Weiter Verhandlung in Syrien."
+ >>> inputs = tokenizer(article, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(summary, return_tensors="pt")
- >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
- >>> hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
+ >>> hidden_states = outputs.last_hidden_state
+ ```"""
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
@@ -59,23 +60,24 @@ class MT5Model(T5Model):
class MT5ForConditionalGeneration(T5ForConditionalGeneration):
r"""
- This class overrides :class:`~transformers.T5ForConditionalGeneration`. Please check the superclass for the
+ This class overrides [`T5ForConditionalGeneration`]. Please check the superclass for the
appropriate documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import MT5ForConditionalGeneration, T5Tokenizer
- >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> summary = "Weiter Verhandlung in Syrien."
- >>> inputs = tokenizer(article, return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(summary, return_tensors="pt")
+ ```python
+ >>> from transformers import MT5ForConditionalGeneration, T5Tokenizer
+ >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> summary = "Weiter Verhandlung in Syrien."
+ >>> inputs = tokenizer(article, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(summary, return_tensors="pt")
- >>> outputs = model(**inputs,labels=labels["input_ids"])
- >>> loss = outputs.loss
- """
+ >>> outputs = model(**inputs,labels=labels["input_ids"])
+ >>> loss = outputs.loss
+ ```"""
model_type = "mt5"
config_class = MT5Config
@@ -89,19 +91,20 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
class MT5EncoderModel(T5EncoderModel):
r"""
- This class overrides :class:`~transformers.T5EncoderModel`. Please check the superclass for the appropriate
+ This class overrides [`T5EncoderModel`]. Please check the superclass for the appropriate
documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import MT5EncoderModel, T5Tokenizer
- >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
- >>> outputs = model(input_ids)
- >>> hidden_state = outputs.last_hidden_state
- """
+ ```python
+ >>> from transformers import MT5EncoderModel, T5Tokenizer
+ >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
+ >>> outputs = model(input_ids)
+ >>> hidden_state = outputs.last_hidden_state
+ ```"""
model_type = "mt5"
config_class = MT5Config
diff --git a/src/transformers/models/mt5/modeling_tf_mt5.py b/src/transformers/models/mt5/modeling_tf_mt5.py
index cd16067693..274c30c23a 100644
--- a/src/transformers/models/mt5/modeling_tf_mt5.py
+++ b/src/transformers/models/mt5/modeling_tf_mt5.py
@@ -27,46 +27,48 @@ _TOKENIZER_FOR_DOC = "T5Tokenizer"
class TFMT5Model(TFT5Model):
r"""
- This class overrides :class:`~transformers.TFT5Model`. Please check the superclass for the appropriate
+ This class overrides [`TFT5Model`]. Please check the superclass for the appropriate
documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import TFMT5Model, T5Tokenizer
- >>> model = TFMT5Model.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> summary = "Weiter Verhandlung in Syrien."
- >>> inputs = tokenizer(article, return_tensors="tf")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(summary, return_tensors="tf")
+ ```python
+ >>> from transformers import TFMT5Model, T5Tokenizer
+ >>> model = TFMT5Model.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> summary = "Weiter Verhandlung in Syrien."
+ >>> inputs = tokenizer(article, return_tensors="tf")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(summary, return_tensors="tf")
- >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
- >>> hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
+ >>> hidden_states = outputs.last_hidden_state
+ ```"""
model_type = "mt5"
config_class = MT5Config
class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
r"""
- This class overrides :class:`~transformers.TFT5ForConditionalGeneration`. Please check the superclass for the
+ This class overrides [`TFT5ForConditionalGeneration`]. Please check the superclass for the
appropriate documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import TFMT5ForConditionalGeneration, T5Tokenizer
- >>> model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> summary = "Weiter Verhandlung in Syrien."
- >>> inputs = tokenizer(article, return_tensors="tf")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(summary, return_tensors="tf")
+ ```python
+ >>> from transformers import TFMT5ForConditionalGeneration, T5Tokenizer
+ >>> model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> summary = "Weiter Verhandlung in Syrien."
+ >>> inputs = tokenizer(article, return_tensors="tf")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(summary, return_tensors="tf")
- >>> outputs = model(**inputs,labels=labels["input_ids"])
- >>> loss = outputs.loss
- """
+ >>> outputs = model(**inputs,labels=labels["input_ids"])
+ >>> loss = outputs.loss
+ ```"""
model_type = "mt5"
config_class = MT5Config
@@ -74,19 +76,20 @@ class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
class TFMT5EncoderModel(TFT5EncoderModel):
r"""
- This class overrides :class:`~transformers.TFT5EncoderModel`. Please check the superclass for the appropriate
+ This class overrides [`TFT5EncoderModel`]. Please check the superclass for the appropriate
documentation alongside usage examples.
- Examples::
+ Examples:
- >>> from transformers import TFMT5EncoderModel, T5Tokenizer
- >>> model = TFMT5EncoderModel.from_pretrained("google/mt5-small")
- >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
- >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
- >>> input_ids = tokenizer(article, return_tensors="tf").input_ids
- >>> outputs = model(input_ids)
- >>> hidden_state = outputs.last_hidden_state
- """
+ ```python
+ >>> from transformers import TFMT5EncoderModel, T5Tokenizer
+ >>> model = TFMT5EncoderModel.from_pretrained("google/mt5-small")
+ >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
+ >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
+ >>> input_ids = tokenizer(article, return_tensors="tf").input_ids
+ >>> outputs = model(input_ids)
+ >>> hidden_state = outputs.last_hidden_state
+ ```"""
model_type = "mt5"
config_class = MT5Config
diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py
index 6153a87301..e9ceeb6af4 100644
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -303,22 +303,21 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
+ mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
Multiple choice classification loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
+ mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -334,69 +333,67 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
OPENAI_GPT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+ config ([`OpenAIGPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
OPENAI_GPT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.OpenAIGPTTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`OpenAIGPTTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -571,10 +568,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -658,38 +655,37 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
return_dict=None,
):
r"""
- mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
- Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
- 1]``.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, defaults to the index of the last token of the input):
+ Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - 1]`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
- ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to
- ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
- mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
- `input_ids` above)
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+ `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+ mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where *num_choices* is the size of the second dimension of the input tensors. (see
+ *input_ids* above)
Return:
- Examples::
+ Examples:
- >>> from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
- >>> import torch
+ ```python
+ >>> from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
+ >>> import torch
- >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
- >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
- >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
- >>> model.resize_token_embeddings(len(tokenizer))
+ >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+ >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+ >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
+ >>> model.resize_token_embeddings(len(tokenizer))
- >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
- >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
- >>> mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+ >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
+ >>> mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1
- >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
- >>> lm_logits = outputs.lm_logits
- >>> mc_logits = outputs.mc_logits
- """
+ >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+ >>> lm_logits = outputs.lm_logits
+ >>> mc_logits = outputs.mc_logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
@@ -737,12 +733,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
@add_start_docstrings(
"""
The Original OpenAI GPT Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.OpenAIGPTForSequenceClassification` uses the last token in order to do the classification, as
+ [`OpenAIGPTForSequenceClassification`] uses the last token in order to do the classification, as
other causal models (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the
- position of the last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that
- is not a padding token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each
- row of the batch. Since it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of
- :obj:`input_ids`, it does the same (take the last value in each row of the batch).
+ position of the last token. If a `pad_token_id` is defined in the configuration, it finds the last token that
+ is not a padding token in each row. If no `pad_token_id` is defined, it simply takes the last value in each
+ row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of
+ `input_ids`, it does the same (take the last value in each row of the batch).
""",
OPENAI_GPT_START_DOCSTRING,
)
@@ -777,10 +773,9 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py
index 221f9c63cb..2f31e5539a 100644
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -393,18 +393,17 @@ class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
+ mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -418,93 +417,92 @@ class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
OPENAI_GPT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
Parameters:
- config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+ config ([`OpenAIGPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
OPENAI_GPT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.OpenAIGPTTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`OpenAIGPTTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -618,9 +616,8 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -715,32 +712,32 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
**kwargs,
):
r"""
- mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
- Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
- 1]``.
+ mc_token_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_choices)`, *optional*, defaults to the index of the last token of the input):
+ Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - 1]`.
Return:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
- >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
- >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+ >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+ >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
- >>> # Add a [CLS] to the vocabulary (we should train it also!)
- >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
- >>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
- >>> print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
+ >>> # Add a [CLS] to the vocabulary (we should train it also!)
+ >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+ >>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
+ >>> print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
- >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
- >>> encoding = tokenizer(choices, return_tensors="tf")
- >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
- >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1
- >>> outputs = model(inputs)
- >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
- """
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+ >>> encoding = tokenizer(choices, return_tensors="tf")
+ >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
+ >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1
+ >>> outputs = model(inputs)
+ >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+ ```"""
inputs = input_processing(
func=self.call,
@@ -830,13 +827,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
"""
The OpenAI GPT Model transformer with a sequence classification head on top (linear layer).
- :class:`~transformers.TFOpenAIGPTForSequenceClassification` uses the last token in order to do the classification,
+ [`TFOpenAIGPTForSequenceClassification`] uses the last token in order to do the classification,
as other causal models (e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
- :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
- row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
- guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
+ row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+ guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
the last value in each row of the batch).
""",
OPENAI_GPT_START_DOCSTRING,
@@ -876,9 +873,8 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py
index 48fe22c5d5..f55a993801 100644
--- a/src/transformers/models/pegasus/modeling_flax_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py
@@ -58,162 +58,157 @@ _CONFIG_FOR_DOC = "PegasusConfig"
_TOKENIZER_FOR_DOC = "PegasusTokenizer"
PEGASUS_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Flax Linen `flax.nn.Module
- `__ subclass. Use it as a regular Flax
+ This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax
Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.PegasusConfig`): Model configuration class with all the parameters of the model.
+ config ([`PegasusConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
+ [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`].
"""
PEGASUS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
PEGASUS_ENCODE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
PEGASUS_DECODE_INPUTS_DOCSTRING = r"""
Args:
- decoder_input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`):
+ decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
- encoder_outputs (:obj:`tuple(tuple(jnp.ndarray)`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
+ encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- encoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_attention_mask (:obj:`jnp.ndarray` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- If you want to change padding behavior, you should modify to your needs. See diagram 1 in `the paper
- `__ for more information on the default strategy.
- decoder_position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
- range ``[0, config.max_position_embeddings - 1]``.
- past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
+ range `[0, config.max_position_embeddings - 1]`.
+ past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
- auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -940,15 +935,14 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
- max_length (:obj:`int`):
+ max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
- encoder_outputs (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
- ``encoder_outputs`` consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`,
- `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`,
+ *optional*: `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
index e898826950..b5e4879be2 100755
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -312,13 +312,13 @@ class PegasusEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -396,19 +396,19 @@ class PegasusDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -495,19 +495,19 @@ class PegasusPreTrainedModel(PreTrainedModel):
PEGASUS_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.PegasusConfig`):
+ config ([`PegasusConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PEGASUS_GENERATION_EXAMPLE = r"""
@@ -532,102 +532,98 @@ PEGASUS_GENERATION_EXAMPLE = r"""
PEGASUS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Pegasus uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0,
- 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class PegasusEncoder(PegasusPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`PegasusEncoderLayer`.
+ [`PegasusEncoderLayer`].
Args:
config: PegasusConfig
@@ -664,11 +660,10 @@ class PegasusEncoder(PegasusPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings matrix of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings matrix of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
@@ -703,40 +698,40 @@ class PegasusEncoder(PegasusPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -826,7 +821,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
class PegasusDecoder(PegasusPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`PegasusDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`PegasusDecoderLayer`].
Args:
config: PegasusConfig
@@ -885,11 +880,10 @@ class PegasusDecoder(PegasusPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings matrix of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings matrix of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
@@ -929,71 +923,68 @@ class PegasusDecoder(PegasusPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in decoder to avoid performing
- cross-attention on hidden heads. Mask values selected in ``[0, 1]``:
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1163,11 +1154,10 @@ class PegasusModel(PegasusPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings matrix of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings matrix of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
@@ -1327,11 +1317,10 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings matrix of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings matrix of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
@@ -1371,10 +1360,9 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
@@ -1473,7 +1461,7 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel):
class PegasusDecoderWrapper(PegasusPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -1523,11 +1511,10 @@ class PegasusForCausalLM(PegasusPreTrainedModel):
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
- Resizes position embeddings matrix of the model if :obj:`new_num_position_embeddings !=
- config.max_position_embeddings`.
+ Resizes position embeddings matrix of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
- new_num_position_embeddings (:obj:`int`):
+ new_num_position_embeddings (`int`):
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
@@ -1557,88 +1544,87 @@ class PegasusForCausalLM(PegasusPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import PegasusTokenizer, PegasusForCausalLM
+ ```python
+ >>> from transformers import PegasusTokenizer, PegasusForCausalLM
- >>> tokenizer = PegasusTokenizer.from_pretrained('facebook/bart-large')
- >>> model = PegasusForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+ >>> model = PegasusForCausalLM.from_pretrained('google/pegasus-large', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- """
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py
index e3b90377c7..d0496dc215 100644
--- a/src/transformers/models/pegasus/modeling_tf_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py
@@ -340,11 +340,11 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(encoder_attention_heads,)`
+ hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`tf.Tensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -417,17 +417,17 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
"""
Args:
- hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (:obj:`tf.Tensor`): attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`tf.Tensor`): mask for attention heads in a given layer of size
- `(decoder_attention_heads,)`
- cross_attn_layer_head_mask (:obj:`tf.Tensor`): mask for heads of the cross-attention module.
- `(decoder_attention_heads,)`
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
+ hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`tf.Tensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+ `(decoder_attention_heads,)`
+ cross_attn_layer_head_mask (`tf.Tensor`): mask for cross-attention heads in a given layer of size
+ `(decoder_attention_heads,)`
+ past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -518,37 +518,39 @@ class TFPegasusPreTrainedModel(TFPreTrainedModel):
PEGASUS_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ </Tip>
+
Args:
- config (:class:`~transformers.PegasusConfig`): Model configuration class with all the parameters of the model.
+ config ([`PegasusConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
@@ -574,79 +576,79 @@ PEGASUS_GENERATION_EXAMPLE = r"""
PEGASUS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- Pegasus uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
+ encoder_outputs (`tf.FloatTensor`, *optional*):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
- of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all
- attention layers. See ``attentions`` under returned tensors for more detail. This argument can be used only
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all
+ attention layers. See `attentions` under returned tensors for more detail. This argument can be used only
in eager mode, in graph mode the value in the config will be used instead.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -657,7 +659,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
config_class = PegasusConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`TFPegasusEncoderLayer`.
+ [`TFPegasusEncoderLayer`].
Args:
config: PegasusConfig
@@ -701,44 +703,43 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
):
"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -827,7 +828,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
class TFPegasusDecoder(tf.keras.layers.Layer):
config_class = PegasusConfig
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFPegasusDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFPegasusDecoderLayer`].
Args:
config: PegasusConfig
@@ -876,69 +877,66 @@ class TFPegasusDecoder(tf.keras.layers.Layer):
):
r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`PegasusTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
in the config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail. This argument can be used only in eager mode, in graph mode the value in the config
will be used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -1384,10 +1382,9 @@ class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLangua
**kwargs,
):
"""
- labels (:obj:`tf.tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index e231fa8877..5ab22e7bb6 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -69,21 +69,19 @@ class PerceiverModelOutput(ModelOutput):
Base class for Perceiver base model's outputs, with potential hidden states, attentions and cross-attentions.
Args:
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
"""
@@ -100,11 +98,10 @@ class PerceiverDecoderOutput(ModelOutput):
Base class for Perceiver decoder outputs, with potential cross-attentions.
Args:
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
Output of the basic decoder.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
"""
@@ -118,21 +115,19 @@ class PerceiverMaskedLMOutput(ModelOutput):
Base class for Perceiver's masked language model outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, num_latents,
- num_latents)`. Attentions weights after the attention softmax, used to compute the weighted average in the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_latents, num_latents)`. Attentions weights after the attention softmax, used to compute the weighted average in the
self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
"""
@@ -150,21 +145,19 @@ class PerceiverClassifierOutput(ModelOutput):
autoencoding.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
attention softmax, used to compute the weighted average in the cross-attention heads.
"""
@@ -646,74 +639,74 @@ class PerceiverPreTrainedModel(PreTrainedModel):
PERCEIVER_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.PerceiverConfig`): Model configuration class with all the parameters of the model.
+ config ([`PerceiverConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
PERCEIVER_MODEL_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.PerceiverConfig`): Model configuration class with all the parameters of the model.
+ config ([`PerceiverConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
- decoder (`DecoderType`, `optional`):
+ decoder (`DecoderType`, *optional*):
Optional decoder to use to decode the latent representation of the encoder. Examples include
- `transformers.models.perceiver.modeling_perceiver.PerceiverBasicDecoder`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`.
- input_preprocessor (`PreprocessorType`, `optional`):
+ `transformers.models.perceiver.modeling_perceiver.PerceiverBasicDecoder`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`.
+ input_preprocessor (`PreprocessorType`, *optional*):
Optional input preprocessor to use. Examples include
- `transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverTextPreprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor`.
- output_postprocessor (`PostprocessorType`, `optional`):
+ `transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverTextPreprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor`.
+ output_postprocessor (`PostprocessorType`, *optional*):
Optional output postprocessor to use. Examples include
- `transformers.models.perceiver.modeling_perceiver.PerceiverImagePostprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor`,
- `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor`.
+ `transformers.models.perceiver.modeling_perceiver.PerceiverImagePostprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor`,
+ `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor`.
Note that you can define your own decoders, preprocessors and/or postprocessors to fit your use-case.
"""
PERCEIVER_INPUTS_DOCSTRING = r"""
Args:
- inputs (:obj:`torch.FloatTensor`):
+ inputs (`torch.FloatTensor`):
Inputs to the perceiver. Can be anything: images, text, audio, video, etc.
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -990,47 +983,48 @@ class PerceiverForMaskedLM(PerceiverPreTrainedModel):
input_ids=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
- Examples::
- >>> from transformers import PerceiverTokenizer, PerceiverForMaskedLM
- >>> import torch
+ Examples:
- >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
- >>> model = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')
+ ```python
+ >>> from transformers import PerceiverTokenizer, PerceiverForMaskedLM
+ >>> import torch
- >>> # training
- >>> text = "This is an incomplete sentence where some words are missing."
- >>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
- >>> # mask " missing."
- >>> inputs['input_ids'][0, 52:61] = tokenizer.mask_token_id
- >>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids
+ >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
+ >>> model = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')
- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ >>> # training
+ >>> text = "This is an incomplete sentence where some words are missing."
+ >>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
+ >>> # mask " missing."
+ >>> inputs['input_ids'][0, 52:61] = tokenizer.mask_token_id
+ >>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids
- >>> # inference
- >>> text = "This is an incomplete sentence where some words are missing."
- >>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")
+ >>> outputs = model(**inputs, labels=labels)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
- >>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
- >>> encoding['input_ids'][0, 52:61] = tokenizer.mask_token_id
+ >>> # inference
+ >>> text = "This is an incomplete sentence where some words are missing."
+ >>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")
- >>> # forward pass
- >>> with torch.no_grad():
- >>> outputs = model(**encoding)
- >>> logits = outputs.logits
+ >>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
+ >>> encoding['input_ids'][0, 52:61] = tokenizer.mask_token_id
- >>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
- >>> tokenizer.decode(masked_tokens_predictions)
- ' missing.'
- """
+ >>> # forward pass
+ >>> with torch.no_grad():
+ ...     outputs = model(**encoding)
+ >>> logits = outputs.logits
+
+ >>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
+ >>> tokenizer.decode(masked_tokens_predictions)
+ ' missing.'
+ ```"""
if inputs is not None and input_ids is not None:
raise ValueError("You cannot use both `inputs` and `input_ids`")
elif inputs is None and input_ids is not None:
@@ -1110,25 +1104,25 @@ class PerceiverForSequenceClassification(PerceiverPreTrainedModel):
input_ids=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import PerceiverTokenizer, PerceiverForSequenceClassification
+ ```python
+ >>> from transformers import PerceiverTokenizer, PerceiverForSequenceClassification
- >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
- >>> model = PerceiverForSequenceClassification.from_pretrained('deepmind/language-perceiver')
+ >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
+ >>> model = PerceiverForSequenceClassification.from_pretrained('deepmind/language-perceiver')
- >>> text = "hello world"
- >>> inputs = tokenizer(images=image, return_tensors="pt").input_ids
- >>> outputs = model(inputs=inputs)
- >>> logits = outputs.logits
- """
+ >>> text = "hello world"
+ >>> inputs = tokenizer(text, return_tensors="pt").input_ids
+ >>> outputs = model(inputs=inputs)
+ >>> logits = outputs.logits
+ ```"""
if inputs is not None and input_ids is not None:
raise ValueError("You cannot use both `inputs` and `input_ids`")
elif inputs is None and input_ids is not None:
@@ -1190,11 +1184,11 @@ Example use of Perceiver for image classification, for tasks such as ImageNet.
This model uses learned position embeddings. In other words, this model is not given any privileged information about
the structure of images. As shown in the paper, this model can achieve a top-1 accuracy of 72.7 on ImageNet.
-:class:`~transformers.PerceiverForImageClassificationLearned` uses
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with :obj:`prep_type="conv1x1"`)
+[`PerceiverForImageClassificationLearned`] uses
+[`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`] (with `prep_type="conv1x1"`)
to preprocess the input images, and
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder` to decode the latent
-representation of :class:`~transformers.PerceiverModel` into classification logits.
+[`~models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`] to decode the latent
+representation of [`PerceiverModel`] into classification logits.
""",
PERCEIVER_START_DOCSTRING,
)
@@ -1243,32 +1237,32 @@ class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel):
pixel_values=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationLearned
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationLearned
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-learned')
- >>> model = PerceiverForImageClassificationLearned.from_pretrained('deepmind/vision-perceiver-learned')
+ >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-learned')
+ >>> model = PerceiverForImageClassificationLearned.from_pretrained('deepmind/vision-perceiver-learned')
- >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
- >>> outputs = model(inputs=inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
+ >>> outputs = model(inputs=inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
if inputs is not None and pixel_values is not None:
raise ValueError("You cannot use both `inputs` and `pixel_values`")
elif inputs is None and pixel_values is not None:
@@ -1329,11 +1323,11 @@ Example use of Perceiver for image classification, for tasks such as ImageNet.
This model uses fixed 2D Fourier position embeddings. As shown in the paper, this model can achieve a top-1 accuracy of
79.0 on ImageNet, and 84.5 when pre-trained on a large-scale dataset (i.e. JFT).
-:class:`~transformers.PerceiverForImageClassificationLearned` uses
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with :obj:`prep_type="pixels"`)
+[`PerceiverForImageClassificationFourier`] uses
+[`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`] (with `prep_type="pixels"`)
to preprocess the input images, and
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder` to decode the latent
-representation of :class:`~transformers.PerceiverModel` into classification logits.
+[`~models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`] to decode the latent
+representation of [`PerceiverModel`] into classification logits.
""",
PERCEIVER_START_DOCSTRING,
)
@@ -1380,32 +1374,32 @@ class PerceiverForImageClassificationFourier(PerceiverPreTrainedModel):
pixel_values=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationFourier
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationFourier
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-fourier')
- >>> model = PerceiverForImageClassificationFourier.from_pretrained('deepmind/vision-perceiver-fourier')
+ >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-fourier')
+ >>> model = PerceiverForImageClassificationFourier.from_pretrained('deepmind/vision-perceiver-fourier')
- >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
- >>> outputs = model(inputs=inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
+ >>> outputs = model(inputs=inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
if inputs is not None and pixel_values is not None:
raise ValueError("You cannot use both `inputs` and `pixel_values`")
elif inputs is None and pixel_values is not None:
@@ -1465,11 +1459,11 @@ Example use of Perceiver for image classification, for tasks such as ImageNet.
This model uses a 2D conv+maxpool preprocessing network. As shown in the paper, this model can achieve a top-1 accuracy
of 82.1 on ImageNet.
-:class:`~transformers.PerceiverForImageClassificationLearned` uses
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with :obj:`prep_type="conv"`) to
+[`PerceiverForImageClassificationConvProcessing`] uses
+[`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`] (with `prep_type="conv"`) to
preprocess the input images, and
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder` to decode the latent
-representation of :class:`~transformers.PerceiverModel` into classification logits.
+[`~models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`] to decode the latent
+representation of [`PerceiverModel`] into classification logits.
""",
PERCEIVER_START_DOCSTRING,
)
@@ -1517,32 +1511,32 @@ class PerceiverForImageClassificationConvProcessing(PerceiverPreTrainedModel):
pixel_values=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationConvProcessing
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationConvProcessing
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-conv')
- >>> model = PerceiverForImageClassificationConvProcessing.from_pretrained('deepmind/vision-perceiver-conv')
+ >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-conv')
+ >>> model = PerceiverForImageClassificationConvProcessing.from_pretrained('deepmind/vision-perceiver-conv')
- >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
- >>> outputs = model(inputs=inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
+ >>> outputs = model(inputs=inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
if inputs is not None and pixel_values is not None:
raise ValueError("You cannot use both `inputs` and `pixel_values`")
elif inputs is None and pixel_values is not None:
@@ -1598,10 +1592,10 @@ class PerceiverForImageClassificationConvProcessing(PerceiverPreTrainedModel):
@add_start_docstrings(
"""
Example use of Perceiver for optical flow, for tasks such as Sintel and KITTI.
-:class:`~transformers.PerceiverForOpticalFlow` uses
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with `prep_type="patches"`) to
-preprocess the input images, and :class:`~transformers.models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder`
-to decode the latent representation of :class:`~transformers.PerceiverModel`.
+[`PerceiverForOpticalFlow`] uses
+[`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`] (with `prep_type="patches"`) to
+preprocess the input images, and [`~models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder`]
+to decode the latent representation of [`PerceiverModel`].
As input, one concatenates 2 subsequent frames along the channel dimension and extract a 3 x 3 patch around each pixel
(leading to 3 x 3 x 3 x 2 = 54 values for each pixel). Fixed Fourier position encodings are used to encode the position
@@ -1670,26 +1664,27 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the optical flow loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the optical flow loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
- Examples::
+ Examples:
- >>> from transformers import PerceiverForOpticalFlow
- >>> import torch
+ ```python
+ >>> from transformers import PerceiverForOpticalFlow
+ >>> import torch
- >>> model = PerceiverForOpticalFlow.from_pretrained('deepmind/optical-flow-perceiver')
+ >>> model = PerceiverForOpticalFlow.from_pretrained('deepmind/optical-flow-perceiver')
- >>> # in the Perceiver IO paper, the authors extract a 3 x 3 patch around each pixel,
- >>> # leading to 3 x 3 x 3 = 27 values for each pixel (as each pixel also has 3 color channels)
- >>> # patches have shape (batch_size, num_frames, num_channels, height, width)
- >>> # the authors train on resolutions of 368 x 496
- >>> patches = torch.randn(1, 2, 27, 368, 496)
- >>> outputs = model(inputs=patches)
- >>> logits = outputs.logits
- """
+ >>> # in the Perceiver IO paper, the authors extract a 3 x 3 patch around each pixel,
+ >>> # leading to 3 x 3 x 3 = 27 values for each pixel (as each pixel also has 3 color channels)
+ >>> # patches have shape (batch_size, num_frames, num_channels, height, width)
+ >>> # the authors train on resolutions of 368 x 496
+ >>> patches = torch.randn(1, 2, 27, 368, 496)
+ >>> outputs = model(inputs=patches)
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.perceiver(
@@ -1723,25 +1718,25 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
"""
Example use of Perceiver for multimodal (video) autoencoding, for tasks such as Kinetics-700.
-:class:`~transformers.PerceiverForMultimodalAutoencoding` uses
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor` to preprocess the 3
+[`PerceiverForMultimodalAutoencoding`] uses
+[`~models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor`] to preprocess the 3
modalities: images, audio and class labels. This preprocessor uses modality-specific preprocessors to preprocess every
modality separately, after which they are concatenated. Trainable position embeddings are used to pad each modality to
the same number of channels to make concatenation along the time dimension possible. Next, one applies the Perceiver
encoder.
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder` is used to decode the latent
-representation of :class:`~transformers.PerceiverModel`. This decoder uses each modality-specific decoder to construct
+[`~models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`] is used to decode the latent
+representation of [`PerceiverModel`]. This decoder uses each modality-specific decoder to construct
queries. The decoder queries are created based on the inputs after preprocessing. However, autoencoding an entire video
in a single forward pass is computationally infeasible, hence one only uses parts of the decoder queries to do
cross-attention with the latent representation. This is determined by the subsampled indices for each modality, which
-can be provided as additional input to the forward pass of :class:`~transformers.PerceiverForMultimodalAutoencoding`.
+can be provided as additional input to the forward pass of [`PerceiverForMultimodalAutoencoding`].
-:class:`~transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder` also pads the decoder queries of
+[`~models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`] also pads the decoder queries of
the different modalities to the same number of channels, in order to concatenate them along the time dimension. Next,
-cross-attention is performed with the latent representation of :class:`~transformers.PerceiverModel`.
+cross-attention is performed with the latent representation of [`PerceiverModel`].
-Finally, :class:`~transformers.models.perceiver.modeling_perceiver.PerceiverMultiModalPostprocessor` is used to turn
+Finally, [`~models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor`] is used to turn
this tensor into an actual video. It first splits up the output into the different modalities, and then applies the
respective postprocessor for each modality.
@@ -1880,27 +1875,27 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import PerceiverForMultimodalAutoencoding
- >>> import torch
+ ```python
+ >>> from transformers import PerceiverForMultimodalAutoencoding
+ >>> import torch
- >>> images = torch.randn((1, 16, 3, 224, 224))
- >>> audio = torch.randn((1, 30720, 1))
- >>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
+ >>> images = torch.randn((1, 16, 3, 224, 224))
+ >>> audio = torch.randn((1, 30720, 1))
+ >>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
- >>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver')
+ >>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver')
- >>> outputs = model(inputs=inputs)
- >>> logits = outputs.logits
- """
+ >>> outputs = model(inputs=inputs)
+ >>> logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.perceiver(
@@ -2021,31 +2016,31 @@ class PerceiverBasicDecoder(PerceiverAbstractDecoder):
The shape of the output of this class depends on how one defines the output queries (also called decoder queries).
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- output_num_channels (:obj:`int`, `optional`):
- The number of channels in the output. Will only be used in case `final_project` is set to `True`.
- position_encoding_type (:obj:`str`, `optional`, defaults to "trainable"):
+ output_num_channels (`int`, *optional*):
+ The number of channels in the output. Will only be used in case `final_project` is set to `True`.
+ position_encoding_type (`str`, *optional*, defaults to "trainable"):
The type of position encoding to use. Can be either "trainable", "fourier", or "none".
- output_index_dims (:obj:`int`, `optional`):
+ output_index_dims (`int`, *optional*):
The number of dimensions of the output queries. Ignored if 'position_encoding_type' == 'none'.
- num_channels (:obj:`int`, `optional`):
+ num_channels (`int`, *optional*):
The number of channels of the decoder queries. Ignored if 'position_encoding_type' == 'none'.
- qk_channels (:obj:`int`, `optional`):
+ qk_channels (`int`, *optional*):
The number of channels of the queries and keys in the cross-attention layer.
- v_channels (:obj:`int`, `optional`, defaults to 128):
+ v_channels (`int`, *optional*, defaults to 128):
The number of channels of the values in the cross-attention layer.
- num_heads (:obj:`int`, `optional`, defaults to 1):
+ num_heads (`int`, *optional*, defaults to 1):
The number of attention heads in the cross-attention layer.
- widening_factor (:obj:`int`, `optional`, defaults to 1):
+ widening_factor (`int`, *optional*, defaults to 1):
The widening factor of the cross-attention layer.
- use_query_residual (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_query_residual (`bool`, *optional*, defaults to `False`):
Whether to use a residual connection between the query and the output of the cross-attention layer.
- concat_preprocessed_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ concat_preprocessed_input (`bool`, *optional*, defaults to `False`):
Whether to concatenate the preprocessed input to the query.
- final_project (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ final_project (`bool`, *optional*, defaults to `True`):
Whether to project the output of the cross-attention layer to a target dimension.
- position_encoding_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ position_encoding_only (`bool`, *optional*, defaults to `False`):
Whether to only use this class to define output queries.
"""
@@ -2264,15 +2259,15 @@ class PerceiverOpticalFlowDecoder(PerceiverAbstractDecoder):
class PerceiverBasicVideoAutoencodingDecoder(PerceiverAbstractDecoder):
"""
- Cross-attention based video-autoencoding decoder. Light-weight wrapper of [`PerceiverBasicDecoder`] with video
+ Cross-attention based video-autoencoding decoder. Light-weight wrapper of [`PerceiverBasicDecoder`] with video
reshaping logic.
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- output_shape (:obj:`List[int]`):
+ output_shape (`List[int]`):
Shape of the output as (batch_size, num_frames, height, width), excluding the channel dimension.
- position_encoding_type (:obj:`str`):
+ position_encoding_type (`str`):
The type of position encoding to use. Can be either "trainable", "fourier", or "none".
"""
@@ -2337,7 +2332,7 @@ def restructure(modality_sizes: ModalitySizeType, inputs: torch.Tensor) -> Mappi
class PerceiverMultimodalDecoder(PerceiverAbstractDecoder):
"""
- Multimodal decoding by composing uni-modal decoders. The `modalities` argument of the constructor is a dictionary
+ Multimodal decoding by composing uni-modal decoders. The `modalities` argument of the constructor is a dictionary
mapping modality name to the decoder of that modality. That decoder will be used to construct queries for that
modality. Modality-specific queries are padded with trainable modality-specific parameters, after which they are
concatenated along the time dimension.
@@ -2345,18 +2340,18 @@ class PerceiverMultimodalDecoder(PerceiverAbstractDecoder):
Next, there is a shared cross attention operation across all modalities.
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- modalities (:obj:`Dict[str, PerceiverAbstractDecoder]`):
+ modalities (`Dict[str, PerceiverAbstractDecoder]`):
Dictionary mapping modality name to the decoder of that modality.
- num_outputs (:obj:`int`):
+ num_outputs (`int`):
The number of outputs of the decoder.
- output_num_channels (:obj:`int`):
+ output_num_channels (`int`):
The number of channels in the output.
- min_padding_size (:obj:`int`, `optional`, defaults to 2):
+ min_padding_size (`int`, *optional*, defaults to 2):
The minimum padding size for all modalities. The final output will have num_channels equal to the maximum
channels across all modalities plus min_padding_size.
- subsampled_index_dims (:obj:`Dict[str, PerceiverAbstractDecoder]`, `optional`):
+ subsampled_index_dims (`Dict[str, PerceiverAbstractDecoder]`, *optional*):
Dictionary mapping modality name to the subsampled index dimensions to use for the decoder query of that
modality.
"""
@@ -2530,11 +2525,11 @@ class Conv2DDownsample(nn.Module):
Constructs a Conv2DDownsample model.
Args:
- in_channels (:obj:`int`, `optional`, defaults to 3):
+ in_channels (`int`, *optional*, defaults to 3):
The number of input channels.
- out_channels (:obj:`int`, `optional`, defaults to 64):
+ out_channels (`int`, *optional*, defaults to 64):
The number of conv output channels.
- use_batchnorm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_batchnorm (`bool`, *optional*, defaults to `True`):
Whether to use batchnorm.
"""
super().__init__()
@@ -2559,20 +2554,20 @@ def generate_fourier_features(pos, num_bands, max_resolution=(224, 224), concat_
Generate a Fourier frequency position encoding with linear spacing.
Args:
- pos (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length, dim)`):
+ pos (`torch.LongTensor` of shape `(batch_size, sequence_length, dim)`):
The Tensor containing the position of n points in d dimensional space.
- num_bands (:obj:`int`):
+ num_bands (`int`):
The number of frequency bands (K) to use.
- max_resolution (:obj:`Tuple[int]`, `optional`, defaults to (224, 224)):
+ max_resolution (`Tuple[int]`, *optional*, defaults to (224, 224)):
The maximum resolution (i.e. the number of pixels per dim). A tuple representing resolution for each dimension.
- concat_pos (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ concat_pos (`bool`, *optional*, defaults to `True`):
Whether to concatenate the input position encoding to the Fourier features.
- sine_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ sine_only (`bool`, *optional*, defaults to `False`):
Whether to use a single phase (sin) or two (sin/cos) for each frequency band.
Returns:
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, n_channels)`: The Fourier position
- embeddings. If :obj:`concat_pos` is `True` and :obj:`sine_only` is `False`, output dimensions are ordered as:
+ `torch.FloatTensor` of shape `(batch_size, sequence_length, n_channels)`: The Fourier position
+ embeddings. If `concat_pos` is `True` and `sine_only` is `False`, output dimensions are ordered as:
[dim_1, dim_2, ..., dim_d, sin(pi*f_1*dim_1), ..., sin(pi*f_K*dim_1), ..., sin(pi*f_1*dim_d), ...,
sin(pi*f_K*dim_d), cos(pi*f_1*dim_1), ..., cos(pi*f_K*dim_1), ..., cos(pi*f_1*dim_d), ..., cos(pi*f_K*dim_d)],
where dim_i is pos[:, i] and f_k is the kth frequency band.
@@ -2611,13 +2606,13 @@ def build_linear_positions(index_dims, output_range=(-1.0, 1.0)):
Generate an array of position indices for an N-D input array.
Args:
- index_dims (:obj:`List[int]`):
+ index_dims (`List[int]`):
The shape of the index dimensions of the input array.
- output_range (:obj:`Tuple[float]`, `optional`, defaults to :obj:`(-1.0, 1.0)`):
+ output_range (`Tuple[float]`, *optional*, defaults to `(-1.0, 1.0)`):
The min and max values taken by each input index dimension.
Returns:
- :obj:`torch.FloatTensor` of shape :obj:`(index_dims[0], index_dims[1], .., index_dims[-1], N)`.
+ `torch.FloatTensor` of shape `(index_dims[0], index_dims[1], ..., index_dims[-1], N)`.
"""
def _linspace(n_xels_per_dim):
@@ -2678,15 +2673,15 @@ def _check_or_build_spatial_positions(pos, index_dims, batch_size):
Checks or builds spatial position features (x, y, ...).
Args:
- pos (:obj:`torch.FloatTensor`):
+ pos (`torch.FloatTensor`):
None, or an array of position features. If None, position features are built. Otherwise, their size is checked.
- index_dims (:obj:`List[int]`):
+ index_dims (`List[int]`):
An iterable giving the spatial/index size of the data to be featurized.
- batch_size (:obj:`int`):
+ batch_size (`int`):
The batch size of the data to be featurized.
Returns:
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, prod(index_dims))` an array of position features.
+ `torch.FloatTensor` of shape `(batch_size, prod(index_dims))` an array of position features.
"""
if pos is None:
pos = build_linear_positions(index_dims)
@@ -2805,11 +2800,11 @@ class PerceiverMultimodalPostprocessor(nn.Module):
postprocessor.
Args:
- modalities (:obj:`Dict[str, PostprocessorType]`):
+ modalities (`Dict[str, PostprocessorType]`):
Dictionary mapping modality name to postprocessor class for that modality.
- input_is_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ input_is_dict (`bool`, *optional*, defaults to `False`):
If True, input is assumed to be dictionary structured, and outputs keep the same dictionary shape. If
- False, input is a tensor which is sliced up during postprocessing by `modality_sizes`.
+ False, input is a tensor which is sliced up during postprocessing by `modality_sizes`.
"""
def __init__(self, modalities: Mapping[str, PostprocessorType], input_is_dict: bool = False):
@@ -2838,9 +2833,9 @@ class PerceiverClassificationPostprocessor(nn.Module):
Classification postprocessing for Perceiver. Can be used to convert the decoder output to classification logits.
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- in_channels (:obj:`int`):
+ in_channels (`int`):
Number of channels in the input.
"""
@@ -2858,11 +2853,11 @@ class PerceiverAudioPostprocessor(nn.Module):
Audio postprocessing for Perceiver. Can be used to convert the decoder output to audio features.
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- in_channels (:obj:`int`):
+ in_channels (`int`):
Number of channels in the input.
- postproc_type (:obj:`str`, `optional`, defaults to :obj:`"patches"`):
+ postproc_type (`str`, *optional*, defaults to `"patches"`):
Postprocessor type to use. Currently, only "patches" is supported.
"""
@@ -2887,9 +2882,9 @@ class PerceiverProjectionPostprocessor(nn.Module):
dimension.
Args:
- in_channels (:obj:`int`):
+ in_channels (`int`):
Number of channels in the input.
- out_channels (:obj:`int`):
+ out_channels (`int`):
Number of channels in the output.
"""
@@ -2906,36 +2901,36 @@ class PerceiverImagePreprocessor(AbstractPreprocessor):
"""
Image preprocessing for Perceiver Encoder.
- Note: the `out_channels` argument refers to the output channels of a convolutional layer, if `prep_type` is set to
- "conv1x1" or "conv". If one adds absolute position embeddings, one must make sure the `num_channels` of the
- position encoding kwargs are set equal to the `out_channels`.
+ Note: the `out_channels` argument refers to the output channels of a convolutional layer, if `prep_type` is set to
+ "conv1x1" or "conv". If one adds absolute position embeddings, one must make sure the `num_channels` of the
+ position encoding kwargs are set equal to the `out_channels`.
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- prep_type (:obj:`str`, `optional`, defaults to :obj:`"conv"`):
+ prep_type (`str`, *optional*, defaults to `"conv"`):
Preprocessing type. Can be "conv1x1", "conv", "patches", "pixels".
- spatial_downsample (:obj:`int`, `optional`, defaults to 4):
+ spatial_downsample (`int`, *optional*, defaults to 4):
Spatial downsampling factor.
- temporal_downsample (:obj:`int`, `optional`, defaults to 1):
+ temporal_downsample (`int`, *optional*, defaults to 1):
Temporal downsampling factor (only relevant in case a time dimension is present).
- position_encoding_type (:obj:`str`, `optional`, defaults to :obj:`"fourier"`):
+ position_encoding_type (`str`, *optional*, defaults to `"fourier"`):
Position encoding type. Can be "fourier" or "trainable".
- in_channels (:obj:`int`, `optional`, defaults to 3):
+ in_channels (`int`, *optional*, defaults to 3):
Number of channels in the input.
- out_channels (:obj:`int`, `optional`, defaults to 64):
+ out_channels (`int`, *optional*, defaults to 64):
Number of channels in the output.
- conv_after_patching (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ conv_after_patching (`bool`, *optional*, defaults to `False`):
Whether to apply a convolutional layer after patching.
- conv_after_patching_in_channels (:obj:`int`, `optional`, defaults to 54):
+ conv_after_patching_in_channels (`int`, *optional*, defaults to 54):
Number of channels in the input of the convolutional layer after patching.
- conv2d_use_batchnorm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ conv2d_use_batchnorm (`bool`, *optional*, defaults to `True`):
Whether to use batch normalization in the convolutional layer.
- concat_or_add_pos (:obj:`str`, `optional`, defaults to :obj:`"concat"`):
+ concat_or_add_pos (`str`, *optional*, defaults to `"concat"`):
How to concatenate the position encoding to the input. Can be "concat" or "add".
- project_pos_dim (:obj:`int`, `optional`, defaults to -1):
+ project_pos_dim (`int`, *optional*, defaults to -1):
Dimension of the position encoding to project to. If -1, no projection is applied.
- **position_encoding_kwargs (:obj:`Dict`, `optional`):
+ **position_encoding_kwargs (`Dict`, *optional*):
Keyword arguments for the position encoding.
"""
@@ -3162,21 +3157,21 @@ class PerceiverAudioPreprocessor(AbstractPreprocessor):
Audio preprocessing for Perceiver Encoder.
Args:
- config ([`PerceiverConfig`]):
+ config ([`PerceiverConfig`]):
Model configuration.
- prep_type (:obj:`str`, `optional`, defaults to :obj:`"patches"`):
+ prep_type (`str`, *optional*, defaults to `"patches"`):
Preprocessor type to use. Only "patches" is supported.
- samples_per_patch (:obj:`int`, `optional`, defaults to 96):
+ samples_per_patch (`int`, *optional*, defaults to 96):
Number of samples per patch.
- position_encoding_type (:obj:`str`, `optional`, defaults to :obj:`"fourier"`):
+ position_encoding_type (`str`, *optional*, defaults to `"fourier"`):
Type of position encoding to use. Can be "trainable" or "fourier".
- concat_or_add_pos (:obj:`str`, `optional`, defaults to :obj:`"concat"`):
+ concat_or_add_pos (`str`, *optional*, defaults to `"concat"`):
How to concatenate the position encoding to the input. Can be "concat" or "add".
- out_channels (:obj:`int`, `optional`, defaults to 64):
+ out_channels (`int`, *optional*, defaults to 64):
Number of channels in the output.
- project_pos_dim (:obj:`int`, `optional`, defaults to -1):
+ project_pos_dim (`int`, *optional*, defaults to -1):
Dimension of the position encoding to project to. If -1, no projection is applied.
- **position_encoding_kwargs (:obj:`Dict`, `optional`):
+ **position_encoding_kwargs (`Dict`, *optional*):
Keyword arguments for the position encoding.
"""
@@ -3262,11 +3257,11 @@ class PerceiverMultimodalPreprocessor(AbstractPreprocessor):
of channels.
Args:
- modalities (:obj:`Dict[str, PreprocessorType]`):
+ modalities (`Dict[str, PreprocessorType]`):
Dict mapping modality name to preprocessor.
- mask_probs (:obj:`Dict[str, float]`):
+ mask_probs (`Dict[str, float]`):
Dict mapping modality name to masking probability of that modality.
- min_padding_size (:obj:`int`, `optional`, defaults to 2):
+ min_padding_size (`int`, *optional*, defaults to 2):
The minimum padding size for all modalities. The final output will have num_channels equal to the maximum
channels across all modalities plus min_padding_size.
"""
diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py
index a3e89aa69f..34f8f1904f 100644
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -51,133 +51,133 @@ PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
PROPHETNET_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
Original ProphetNet code can be found at <https://github.com/microsoft/ProphetNet>. Checkpoints were converted
from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
- file ``convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py``.
+ file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
- This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.ProphetNetConfig`): Model configuration class with all the parameters of the model.
+ config ([`ProphetNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
PROPHETNET_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ProphetNetTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ProphetNetTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- ProphetNet uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*, is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
- Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ProphetNetTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -259,59 +259,54 @@ class ProphetNetSeq2SeqLMOutput(ModelOutput):
Base class for sequence-to-sequence language models outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
Prediction scores of the main stream language modeling head (scores for each vocabulary token before
SoftMax).
- logits_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+ logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
SoftMax).
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, decoder_sequence_length, hidden_size)`.
Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+ decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- decoder_ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- encoder_sequence_length, decoder_sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
compute the weighted average in the
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, encoder_sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
softmax, used to compute the weighted average in the self-attention heads.
"""
@@ -344,58 +339,52 @@ class ProphetNetSeq2SeqModelOutput(ModelOutput):
decoding.
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- last_hidden_state_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,ngram * decoder_sequence_length, config.vocab_size)`):
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, decoder_sequence_length, hidden_size)`.
Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
- decoder_ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+ decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
outputs.
- decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- decoder_ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
weighted average in the
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- encoder_sequence_length, decoder_sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
compute the weighted average in the
- encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, encoder_sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- encoder_sequence_length, encoder_sequence_length)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, encoder_sequence_length, encoder_sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
@@ -428,45 +417,40 @@ class ProphetNetDecoderModelOutput(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
- If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
- 1, hidden_size)` is output.
- last_hidden_state_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
+ last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, decoder_sequence_length, hidden_size)`.
Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
- ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+ ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
weighted average in the
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- encoder_sequence_length, decoder_sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
compute the weighted average in the
@@ -488,46 +472,42 @@ class ProphetNetDecoderLMOutput(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
Prediction scores of the main stream language modeling head (scores for each vocabulary token before
SoftMax).
- logits_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+ logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
SoftMax).
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
- used (see :obj:`past_key_values` input) to speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+ used (see `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, decoder_sequence_length, hidden_size)`.
Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
- ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+ ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
- ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- decoder_sequence_length, decoder_sequence_length)`.
+ ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
weighted average in the
- cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
- encoder_sequence_length, decoder_sequence_length)`.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
compute the weighted average in the
@@ -1248,8 +1228,8 @@ class ProphetNetDecoderLayer(nn.Module):
)
class ProphetNetEncoder(ProphetNetPreTrainedModel):
r"""
- word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
- The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
+ word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
+ The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
@@ -1387,8 +1367,8 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
)
class ProphetNetDecoder(ProphetNetPreTrainedModel):
r"""
- word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
- The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
+ word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
+ The word embedding parameters. This can be used to initialize [`ProphetNetDecoder`] with
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
@@ -1440,46 +1420,47 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
Returns:
- Example::
+ Example:
- >>> from transformers import ProphetNetTokenizer, ProphetNetDecoder
- >>> import torch
+ ```python
+ >>> from transformers import ProphetNetTokenizer, ProphetNetDecoder
+ >>> import torch
- >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
- >>> model = ProphetNetDecoder.from_pretrained('microsoft/prophetnet-large-uncased', add_cross_attention=False)
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+ >>> model = ProphetNetDecoder.from_pretrained('microsoft/prophetnet-large-uncased', add_cross_attention=False)
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1919,27 +1900,27 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ...,
- config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
- labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+ labels in `[0, ..., config.vocab_size]`
Returns:
- Example::
+ Example:
- >>> from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration
+ ```python
+ >>> from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration
- >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
- >>> model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased')
+ >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+ >>> model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased')
- >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
- >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+ >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
+ >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
- >>> logits_next_token = outputs.logits # logits to predict next token as usual
- >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens
- """
+ >>> logits_next_token = outputs.logits # logits to predict next token as usual
+ >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
@@ -2136,70 +2117,71 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Returns:
- Example::
+ Example:
- >>> from transformers import ProphetNetTokenizer, ProphetNetForCausalLM
- >>> import torch
+ ```python
+ >>> from transformers import ProphetNetTokenizer, ProphetNetForCausalLM
+ >>> import torch
- >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
- >>> model = ProphetNetForCausalLM.from_pretrained('microsoft/prophetnet-large-uncased')
- >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+ >>> model = ProphetNetForCausalLM.from_pretrained('microsoft/prophetnet-large-uncased')
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> logits = outputs.logits
+ >>> logits = outputs.logits
- >>> # Model can also be used with EncoderDecoder framework
- >>> from transformers import BertTokenizer, EncoderDecoderModel, ProphetNetTokenizer
- >>> import torch
+ >>> # Model can also be used with EncoderDecoder framework
+ >>> from transformers import BertTokenizer, EncoderDecoderModel, ProphetNetTokenizer
+ >>> import torch
- >>> tokenizer_enc = BertTokenizer.from_pretrained('bert-large-uncased')
- >>> tokenizer_dec = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
- >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "microsoft/prophetnet-large-uncased")
+ >>> tokenizer_enc = BertTokenizer.from_pretrained('bert-large-uncased')
+ >>> tokenizer_dec = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "microsoft/prophetnet-large-uncased")
- >>> ARTICLE = (
- ... "the us state department said wednesday it had received no "
- ... "formal word from bolivia that it was expelling the us ambassador there "
- ... "but said the charges made against him are `` baseless ."
- ... )
- >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
- >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])
+ >>> ARTICLE = (
+ ... "the us state department said wednesday it had received no "
+ ... "formal word from bolivia that it was expelling the us ambassador there "
+ ... "but said the charges made against him are `` baseless ."
+ ... )
+ >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
+ >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])
- >>> loss = outputs.loss
- """
+ >>> loss = outputs.loss
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
@@ -2309,7 +2291,7 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
class ProphetNetDecoderWrapper(ProphetNetPreTrainedModel):
"""
- This is a wrapper class, so that :class:`~transformers.ProphetNetForCausalLM` can correctly be loaded from
+ This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from
pretrained prophetnet classes.
"""
diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py
index 1e66403692..0d6495c69a 100755
--- a/src/transformers/models/qdqbert/modeling_qdqbert.py
+++ b/src/transformers/models/qdqbert/modeling_qdqbert.py
@@ -760,69 +760,67 @@ class QDQBertPreTrainedModel(PreTrainedModel):
QDQBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.QDQBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`QDQBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
QDQBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -834,13 +832,13 @@ class QDQBertModel(QDQBertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
"""
@@ -895,24 +893,24 @@ class QDQBertModel(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1055,46 +1053,47 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, QDQBertLMHeadModel, QDQBertConfig
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, QDQBertLMHeadModel, QDQBertConfig
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
- >>> config = QDQBertConfig.from_pretrained("bert-base-cased")
- >>> config.is_decoder = True
- >>> model = QDQBertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> config = QDQBertConfig.from_pretrained("bert-base-cased")
+ >>> config.is_decoder = True
+ >>> model = QDQBertLMHeadModel.from_pretrained('bert-base-cased', config=config)
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
@@ -1208,10 +1207,9 @@ class QDQBertForMaskedLM(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1297,31 +1295,32 @@ class QDQBertForNextSentencePrediction(QDQBertPreTrainedModel):
**kwargs,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
- (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, QDQBertForNextSentencePrediction
- >>> import torch
+ ```python
+ >>> from transformers import BertTokenizer, QDQBertForNextSentencePrediction
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = QDQBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = QDQBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
- >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
- >>> logits = outputs.logits
- >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
- """
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```"""
if "next_sentence_label" in kwargs:
warnings.warn(
@@ -1405,10 +1404,9 @@ class QDQBertForSequenceClassification(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1502,10 +1500,9 @@ class QDQBertForMultipleChoice(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1598,9 +1595,8 @@ class QDQBertForTokenClassification(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1690,13 +1686,13 @@ class QDQBertForQuestionAnswering(QDQBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 9c64ed5995..d6124c8b47 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -41,71 +41,66 @@ class RetrievAugLMMarginOutput(ModelOutput):
Base class for retriever augmented marginalized models outputs.
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
each vocabulary token.
- doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_heads, sequence_length, embed_size_per_head)`).
+ doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
- (see :obj:`past_key_values` input) to speed up sequential decoding.
- retrieved_doc_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs, hidden_size)`, `optional`, returned when `output_retrieved=True`):
- Embedded documents retrieved by the retriever. Is used with ``question_encoder_last_hidden_state`` to
- compute the ``doc_scores``.
- retrieved_doc_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, config.n_docs)`, `optional`, returned when `output_retrieved=True`):
+ (see `past_key_values` input) to speed up sequential decoding.
+ retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when `output_retrieved=True`):
+ Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to
+ compute the `doc_scores`.
+ retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when `output_retrieved=True`):
The indexes of the embedded documents retrieved by the retriever.
- context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
+ context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
- context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- question_encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
model.
- question_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
- question_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_enc_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
- generator_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
- generator_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_dec_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
- generator_dec_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -134,69 +129,64 @@ class RetrievAugLMMarginOutput(ModelOutput):
class RetrievAugLMOutput(ModelOutput):
"""
Args:
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
each vocabulary token.
- doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
- past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
- batch_size, num_heads, sequence_length, embed_size_per_head)`).
+ doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
+ past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
- (see :obj:`past_key_values` input) to speed up sequential decoding.
- retrieved_doc_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs, hidden_size)`, `optional`, returned when `output_retrieved=True`):
- Embedded documents retrieved by the retriever. Is used with ``question_encoder_last_hidden_state`` to
- compute the ``doc_scores``.
- retrieved_doc_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, config.n_docs)`, `optional`, returned when `output_retrieved=True`):
+ (see `past_key_values` input) to speed up sequential decoding.
+ retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when `output_retrieved=True`):
+ Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to
+ compute the `doc_scores`.
+ retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when `output_retrieved=True`):
The indexes of the embedded documents retrieved by the retriever.
- context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
+ context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
- context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- question_encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
model.
- question_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
- question_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_enc_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
- generator_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
- generator_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_dec_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
- generator_dec_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
@@ -252,62 +242,62 @@ class RagPreTrainedModel(PreTrainedModel):
Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
model checkpoints.
- The model is set in evaluation mode by default using :obj:`model.eval()` (Dropout modules are deactivated). To
- train the model, you need to first set it back in training mode with :obj:`model.train()`.
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
+ train the model, you need to first set it back in training mode with `model.train()`.
Params:
- question_encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
+ question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
Information necessary to initiate the question encoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- generator_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
+ generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
Information necessary to initiate the generator. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- model_args (remaining positional arguments, `optional`):
- All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
- retriever (:class:`~transformers.RagRetriever`, `optional`):
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+ retriever ([`RagRetriever`], *optional*):
The retriever to use.
- kwwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- ``output_attentions=True``).
+ `output_attentions=True`).
- - To update the question_encoder configuration, use the prefix `question_encoder_` for each
+ - To update the question_encoder configuration, use the prefix `question_encoder_` for each
configuration parameter.
- - To update the generator configuration, use the prefix `generator_` for each configuration parameter.
+ - To update the generator configuration, use the prefix `generator_` for each configuration parameter.
- To update the parent model configuration, do not use a prefix for each configuration parameter.
- Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
- Example::
+ Example:
- >>> from transformers import RagModel
- >>> # initialize a RAG from two pretrained models.
- >>> model = RagModel.from_question_encoder_generator_pretrained('facebook/dpr-question_encoder-single-nq-base', 't5-small')
- >>> # saving model after fine-tuning
- >>> model.save_pretrained("./rag")
- >>> # load fine-tuned model
- >>> model = RagModel.from_pretrained("./rag")
-
- """
+ ```python
+ >>> from transformers import RagModel
+ >>> # initialize a RAG from two pretrained models.
+ >>> model = RagModel.from_question_encoder_generator_pretrained('facebook/dpr-question_encoder-single-nq-base', 't5-small')
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./rag")
+ >>> # load fine-tuned model
+ >>> model = RagModel.from_pretrained("./rag")
+ ```"""
kwargs_question_encoder = {
argument[len("question_encoder_") :]: value
@@ -387,104 +377,101 @@ RAG_START_DOCSTRING = r"""
pass, we encode the input with the question encoder and pass it to the retriever to extract relevant context
documents. The documents are then prepended to the input. Such contextualized inputs is passed to the generator.
- The question encoder can be any `autoencoding` model, preferably :class:`~transformers.DPRQuestionEncoder`, and the
- generator can be any `seq2seq` model, preferably :class:`~transformers.BartForConditionalGeneration`.
+ The question encoder can be any *autoencoding* model, preferably [`DPRQuestionEncoder`], and the
+ generator can be any *seq2seq* model, preferably [`BartForConditionalGeneration`].
- The model can be initialized with a :class:`~transformers.RagRetriever` for end-to-end generation or used in
+ The model can be initialized with a [`RagRetriever`] for end-to-end generation or used in
combination with the outputs of a retriever in multiple steps---see examples for more details. The model is
- compatible any `autoencoding` model as the ``question_encoder`` and any `seq2seq` model with language model head as
- the ``generator``. It has been tested with :class:`~transformers.DPRQuestionEncoder` as the ``question_encoder``
- and :class:`~transformers.BartForConditionalGeneration` or :class:`~transformers.T5ForConditionalGeneration` as the
- ``generator``.
+ compatible with any *autoencoding* model as the `question_encoder` and any *seq2seq* model with language model head as
+ the `generator`. It has been tested with [`DPRQuestionEncoder`] as the `question_encoder`
+ and [`BartForConditionalGeneration`] or [`T5ForConditionalGeneration`] as the
+ `generator`.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Args:
- config (:class:`~transformers.RagConfig`):
+ config ([`RagConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
- question_encoder (:class:`transformers.PreTrainedModel`):
- An encoder model compatible with the faiss index encapsulated by the ``retriever``.
- generator (:class:`transformers.PreTrainedModel`):
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ question_encoder ([`PreTrainedModel`]):
+ An encoder model compatible with the faiss index encapsulated by the `retriever`.
+ generator ([`PreTrainedModel`]):
A seq2seq model used as the generator in the RAG architecture.
- retriever (:class:`~transformers.RagRetriever`):
+ retriever ([`RagRetriever`]):
A retriever class encapsulating a faiss index queried to obtain context documents for current inputs.
"""
RAG_FORWARD_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. :class:`~transformers.RagConfig`, used to initialize
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize
the model, specifies which generator to use, it also specifies a compatible generator tokenizer. Use that
tokenizer class to obtain the indices.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`)
- Tuple consists of (:obj:`generator_enc_last_hidden_state`, `optional`: :obj:`generator_enc_hidden_states`,
- `optional`: :obj:`generator_enc_attentions`). :obj:`generator_enc_last_hidden_state` of shape
- :obj:`(batch_size, n_docs * sequence_length, hidden_size)` is a sequence of hidden-states at the output of
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
+ *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape
+ `(batch_size, n_docs * sequence_length, hidden_size)` is a sequence of hidden-states at the output of
the last layer of the generator's encoder.
- Used by the (:class:`~transformers.RagModel`) model during decoding.
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Provide for generation tasks. `None` by default, construct as per instructions for the generator model
+ Used by the ([`RagModel`]) model during decoding.
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Provide for generation tasks. `None` by default, construct as per instructions for the generator model
you're using with your RAG instance.
- decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`):
- Tuple consists of two elements: :obj:`encoder_outputs` of the RAG model (see :obj:`encoder_outputs`) and
- :obj:`past_key_values` of the underlying generator. Can be used to speed up decoding.
- :obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
+ past_key_values (`tuple(tuple(torch.FloatTensor))`):
+ Tuple consists of two elements: `encoder_outputs` of the RAG model (see `encoder_outputs`) and
+ `past_key_values` of the underlying generator. Can be used to speed up decoding.
+ `past_key_values` are used in the ([`RagTokenForGeneration`]) model during
decoding.
- doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
- :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
- :obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more
+ doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`. If the model is not initialized with a `retriever`,
+ `doc_scores` has to be provided to the forward pass. `doc_scores` can be computed via
+ `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more
information.
- context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Input IDs post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- If the model has is not initialized with a ``retriever`` :obj:`context_input_ids` has to be provided to the
- forward pass. :obj:`context_input_ids` are returned by :meth:`~transformers.RagRetriever.__call__`.
- context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided to the
+ forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
+ context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- If the model has is not initialized with a ``retriever`` :obj:`context_attention_mask` has to be provided
- to the forward pass. :obj:`context_attention_mask` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If the model is not initialized with a `retriever`, `context_attention_mask` has to be provided
+ to the forward pass. `context_attention_mask` are returned by
+ [`~RagRetriever.__call__`].
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- output_retrieved(:obj:`bool`, `optional`):
- Whether or not to return the :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`,
- :obj:`context_input_ids` and :obj:`context_attention_mask`. See returned tensors for more detail.
- n_docs (:obj:`int`, `optional`, defaults to :obj:`config.n_docs`)
+ output_retrieved (`bool`, *optional*):
+ Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`,
+ `context_input_ids` and `context_attention_mask`. See returned tensors for more detail.
+ n_docs (`int`, *optional*, defaults to `config.n_docs`):
Number of documents to retrieve and/or number of documents for which to generate an answer.
"""
@@ -784,44 +771,45 @@ class RagSequenceForGeneration(RagPreTrainedModel):
**kwargs # needs kwargs for generation
):
r"""
- exclude_bos_score (:obj:`bool`, `optional`):
- Only relevant if ``labels`` is passed. If :obj:`True`, the score of the BOS token is disregarded when
+ exclude_bos_score (`bool`, *optional*):
+ Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when
computing the loss.
- reduce_loss (:obj:`bool`, `optional`):
- Only relevant if ``labels`` is passed. If :obj:`True`, the NLL loss is reduced using the
- ``torch.Tensor.sum`` operation.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
- Legacy dictionary, which is required so that model can use `generate()` function.
+ reduce_loss (`bool`, *optional*):
+ Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the
+ `torch.Tensor.sum` operation.
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
+ Legacy dictionary, which is required so that model can use `generate()` function.
Returns:
- Example::
+ Example:
- >>> from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
- >>> import torch
+ ```python
+ >>> from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
+ >>> import torch
- >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
- >>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
- >>> # initialize with RagRetriever to do everything in one forward call
- >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
+ >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+ >>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
+ >>> # initialize with RagRetriever to do everything in one forward call
+ >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
- >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
- >>> input_ids = inputs["input_ids"]
- >>> labels = targets["input_ids"]
- >>> outputs = model(input_ids=input_ids, labels=labels)
+ >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
+ >>> input_ids = inputs["input_ids"]
+ >>> labels = targets["input_ids"]
+ >>> outputs = model(input_ids=input_ids, labels=labels)
- >>> # or use retriever separately
- >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
- >>> # 1. Encode
- >>> question_hidden_states = model.question_encoder(input_ids)[0]
- >>> # 2. Retrieve
- >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
- >>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
- >>> # 3. Forward to generator
- >>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
- """
+ >>> # or use retriever separately
+ >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
+ >>> # 1. Encode
+ >>> question_hidden_states = model.question_encoder(input_ids)[0]
+ >>> # 2. Retrieve
+ >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
+ >>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
+ >>> # 3. Forward to generator
+ >>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
+ ```"""
n_docs = n_docs if n_docs is not None else self.config.n_docs
exclude_bos_score = exclude_bos_score if exclude_bos_score is not None else self.config.exclude_bos_score
reduce_loss = reduce_loss if reduce_loss is not None else self.config.reduce_loss
@@ -908,56 +896,56 @@ class RagSequenceForGeneration(RagPreTrainedModel):
):
"""
Implements RAG sequence "thorough" decoding. Read the
- :meth:`~transformers.generation_utils.GenerationMixin.generate`` documentation for more information on how to
+ [`~generation_utils.GenerationMixin.generate`] documentation for more information on how to
set other generate input parameters.
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- The sequence used as a prompt for the generation. If :obj:`input_ids` is not passed, then
- :obj:`context_input_ids` has to be provided.
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ The sequence used as a prompt for the generation. If `input_ids` is not passed, then
+ `context_input_ids` has to be provided.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
+ [What are attention masks?](../glossary#attention-mask)
+ context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
retriever.
- context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by
+ context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by
the retriever.
- If the model is not initialized with a ``retriever`` or ``input_ids`` is not given,
- :obj:`context_input_ids` and :obj:`context_attention_mask` have to be provided to the forward pass.
- They are returned by :meth:`~transformers.RagRetriever.__call__`.
- doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
+ If the model is not initialized with a `retriever` or `input_ids` is not given,
+ `context_input_ids` and `context_attention_mask` have to be provided to the forward pass.
+ They are returned by [`~RagRetriever.__call__`].
+ doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
- If the model is not initialized with a ``retriever`` or ``input_ids`` is not given, :obj:`doc_scores`
- has to be provided to the forward pass. :obj:`doc_scores` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- do_deduplication (:obj:`bool`, `optional`):
+ If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores`
+ has to be provided to the forward pass. `doc_scores` are returned by
+ [`~RagRetriever.__call__`].
+ do_deduplication (`bool`, *optional*):
Whether or not to deduplicate the generations from different context documents for a given input. Has
- to be set to :obj:`False` if used while training with distributed backend.
- num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+ to be set to `False` if used while training with distributed backend.
+ num_return_sequences (`int`, *optional*, defaults to 1):
The number of independently computed returned sequences for each element in the batch. Note that this
- is not the value we pass to the ``generator``'s
- `:func:`~transformers.generation_utils.GenerationMixin.generate`` function, where we set
- ``num_return_sequences`` to :obj:`num_beams`.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ is not the value we pass to the `generator`'s
+ [`~generation_utils.GenerationMixin.generate`] function, where we set
+ `num_return_sequences` to `num_beams`.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- n_docs (:obj:`int`, `optional`, defaults to :obj:`config.n_docs`)
+ n_docs (`int`, *optional*, defaults to `config.n_docs`):
Number of documents to retrieve and/or number of documents for which to generate an answer.
kwargs:
- Additional kwargs will be passed to :meth:`~transformers.generation_utils.GenerationMixin.generate`.
+ Additional kwargs will be passed to [`~generation_utils.GenerationMixin.generate`].
Return:
- :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
- sequences. The second dimension (sequence length) is either equal to :obj:`max_length` or shorter if all
- batches finished early due to the :obj:`eos_token_id`.
+ `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
+ sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all
+ batches finished early due to the `eos_token_id`.
"""
n_docs = n_docs if n_docs is not None else self.config.n_docs
@@ -1244,48 +1232,49 @@ class RagTokenForGeneration(RagPreTrainedModel):
**kwargs # needs kwargs for generation
):
r"""
- do_marginalize (:obj:`bool`, `optional`):
- If :obj:`True`, the logits are marginalized over all documents by making use of
- ``torch.nn.functional.log_softmax``.
- reduce_loss (:obj:`bool`, `optional`):
- Only relevant if ``labels`` is passed. If :obj:`True`, the NLL loss is reduced using the
- ``torch.Tensor.sum`` operation.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
- Legacy dictionary, which is required so that model can use `generate()` function.
+ do_marginalize (`bool`, *optional*):
+ If `True`, the logits are marginalized over all documents by making use of
+ `torch.nn.functional.log_softmax`.
+ reduce_loss (`bool`, *optional*):
+ Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the
+ `torch.Tensor.sum` operation.
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
+ Legacy dictionary, which is required so that model can use `generate()` function.
Returns:
- Example::
+ Example:
- >>> from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
- >>> import torch
+ ```python
+ >>> from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
+ >>> import torch
- >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
- >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
- >>> # initialize with RagRetriever to do everything in one forward call
- >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
+ >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
+ >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+ >>> # initialize with RagRetriever to do everything in one forward call
+ >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
- >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
- >>> input_ids = inputs["input_ids"]
- >>> labels = targets["input_ids"]
- >>> outputs = model(input_ids=input_ids, labels=labels)
+ >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
+ >>> input_ids = inputs["input_ids"]
+ >>> labels = targets["input_ids"]
+ >>> outputs = model(input_ids=input_ids, labels=labels)
- >>> # or use retriever separately
- >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
- >>> # 1. Encode
- >>> question_hidden_states = model.question_encoder(input_ids)[0]
- >>> # 2. Retrieve
- >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
- >>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
- >>> # 3. Forward to generator
- >>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
+ >>> # or use retriever separately
+ >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
+ >>> # 1. Encode
+ >>> question_hidden_states = model.question_encoder(input_ids)[0]
+ >>> # 2. Retrieve
+ >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
+ >>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
+ >>> # 3. Forward to generator
+ >>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
- >>> # or directly generate
- >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
- >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
- """
+ >>> # or directly generate
+ >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
+ >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
+ ```"""
n_docs = n_docs if n_docs is not None else self.config.n_docs
do_marginalize = do_marginalize if do_marginalize is not None else self.config.do_marginalize
reduce_loss = reduce_loss if reduce_loss is not None else self.config.reduce_loss
@@ -1384,105 +1373,103 @@ class RagTokenForGeneration(RagPreTrainedModel):
Implements RAG token decoding.
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- The sequence used as a prompt for the generation. If :obj:`input_ids` is not passed, then
- :obj:`context_input_ids` has to be provided.
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ The sequence used as a prompt for the generation. If `input_ids` is not passed, then
+ `context_input_ids` has to be provided.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Input IDs post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ [What are attention masks?](../glossary#attention-mask)
+ context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
+ Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
- to the forward pass. :obj:`context_input_ids` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided
+ to the forward pass. `context_input_ids` are returned by
+ [`~RagRetriever.__call__`].
+ context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by
the retriever.
- If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
- to the forward pass. :obj:`context_input_ids` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided
+ to the forward pass. `context_input_ids` are returned by
+ [`~RagRetriever.__call__`].
+ doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embedding (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
- If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
- to the forward pass. :obj:`context_input_ids` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- max_length (:obj:`int`, `optional`, defaults to 20):
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided
+ to the forward pass. `context_input_ids` are returned by
+ [`~RagRetriever.__call__`].
+ max_length (`int`, *optional*, defaults to 20):
The maximum length of the sequence to be generated.
- min_length (:obj:`int`, `optional`, defaults to 10):
+ min_length (`int`, *optional*, defaults to 10):
The minimum length of the sequence to be generated.
- early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to stop the beam search when at least ``num_beams`` sentences are finished per batch or
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether or not to stop the beam search when at least `num_beams` sentences are finished per batch or
not.
- use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should use the past last key/values attentions (if applicable to the model) to
speed up decoding.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- bos_token_id (:obj:`int`, `optional`):
- The id of the `beginning-of-sequence` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ bos_token_id (`int`, *optional*):
+ The id of the *beginning-of-sequence* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty.
Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
order to encourage the model to produce longer sequences.
- no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once.
- encoder_no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
- If set to int > 0, all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the
- ``decoder_input_ids``.
- bad_words_ids(:obj:`List[int]`, `optional`):
+ encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+ If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
+ `decoder_input_ids`.
+ bad_words_ids (`List[int]`, *optional*):
List of token ids that are not allowed to be generated. In order to get the tokens of the words that
- should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- num_beam_groups (:obj:`int`, `optional`, defaults to 1):
- Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
- beams. `this paper `__ for more details.
- diversity_penalty (:obj:`float`, `optional`, defaults to 0.0):
+ num_beam_groups (`int`, *optional*, defaults to 1):
+ Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+ beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+ diversity_penalty (`float`, *optional*, defaults to 0.0):
This value is subtracted from a beam's score if it generates a token same as any beam from other group
- at a particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is
+ at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is
enabled.
- num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+ num_return_sequences (`int`, *optional*, defaults to 1):
The number of independently computed returned sequences for each element in the batch. Note that this
- is not the value we pass to the ``generator``'s
- `:func:`~transformers.generation_utils.GenerationMixin.generate` function, where we set
- ``num_return_sequences`` to :obj:`num_beams`.
- decoder_start_token_id (:obj:`int`, `optional`):
- If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
- n_docs (:obj:`int`, `optional`, defaults to :obj:`config.n_docs`)
+ is not the value we pass to the `generator`'s [`~generation_utils.GenerationMixin.generate`] function, where we set `num_return_sequences` to `num_beams`.
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ n_docs (`int`, *optional*, defaults to `config.n_docs`):
Number of documents to retrieve and/or number of documents for which to generate an answer.
- prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`, `optional`):
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constraints the beam search to allowed tokens only at each step. If not
- provided no constraint is applied. This function takes 2 arguments :obj:`inputs_ids` and the batch ID
- :obj:`batch_id`. It has to return a list with the allowed tokens for the next generation step
- conditioned on the previously generated tokens :obj:`inputs_ids` and the batch ID :obj:`batch_id`. This
+ provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
+ `batch_id`. It has to return a list with the allowed tokens for the next generation step
+ conditioned on the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This
argument is useful for constrained generation conditioned on the prefix, as described in
- `Autoregressive Entity Retrieval `__.
- forced_bos_token_id (:obj:`int`, `optional`):
- The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`.
- Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token
+ [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904).
+ forced_bos_token_id (`int`, *optional*):
+ The id of the token to force as the first generated token after the `decoder_start_token_id`.
+ Useful for multilingual models like [mBART](../model_doc/mbart) where the first generated token
needs to be the target language token.
- forced_eos_token_id (:obj:`int`, `optional`):
- The id of the token to force as the last generated token when :obj:`max_length` is reached.
- remove_invalid_values (:obj:`bool`, `optional`):
- Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to
- crash. Note that using ``remove_invalid_values`` can slow down generation.
+ forced_eos_token_id (`int`, *optional*):
+ The id of the token to force as the last generated token when `max_length` is reached.
+ remove_invalid_values (`bool`, *optional*):
+ Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method from
+ crashing. Note that using `remove_invalid_values` can slow down generation.
Return:
- :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
- sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
- batches finished early due to the :obj:`eos_token_id`.
+ `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
+ sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all
+ batches finished early due to the `eos_token_id`.
"""
# set default parameters
n_docs = n_docs if n_docs is not None else self.config.n_docs
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 063e078cec..ccd1524bb6 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -41,65 +41,61 @@ class TFRetrievAugLMMarginOutput(ModelOutput):
Base class for retriever augmented marginalized models outputs.
Args:
- loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
each vocabulary token.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
- (see :obj:`past_key_values` input) to speed up sequential decoding.
- doc_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
- retrieved_doc_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs, hidden_size)`, `optional`, returned when `output_retrieved=True`):
- Embedded documents retrieved by the retriever. Is used with ``question_encoder_last_hidden_state`` to
- compute the ``doc_scores``.
- retrieved_doc_ids (:obj:`tf.Tensor` (int32) of shape :obj:`(batch_size, config.n_docs)`, `optional`, returned when `output_retrieved=True`):
+ (see `past_key_values` input) to speed up sequential decoding.
+ doc_scores (`tf.Tensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embedding (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
+ retrieved_doc_embeds (`tf.Tensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
+ Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to
+ compute the `doc_scores`.
+ retrieved_doc_ids (`tf.Tensor` (int32) of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
The indexes of the embedded documents retrieved by the retriever.
- context_input_ids (:obj:`tf.Tensor`(int32) of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
+ context_input_ids (`tf.Tensor` (int32) of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
- context_attention_mask (:obj:`tf.Tensor` (int32) of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ context_attention_mask (`tf.Tensor` (int32) of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- question_encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ question_encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
model.
- question_enc_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ question_enc_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
- question_enc_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ question_enc_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_enc_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ generator_enc_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
- generator_enc_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_enc_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
- generator_enc_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ generator_enc_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_dec_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_dec_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
- generator_dec_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ generator_dec_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
@@ -127,63 +123,59 @@ class TFRetrievAugLMMarginOutput(ModelOutput):
class TFRetrievAugLMOutput(ModelOutput):
"""
Args:
- logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
each vocabulary token.
- past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
- num_heads, sequence_length, embed_size_per_head)`).
+ past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
- (see :obj:`past_key_values` input) to speed up sequential decoding.
- doc_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
- retrieved_doc_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs, hidden_size)`, `optional`, returned when `output_retrieved=True`):
- Embedded documents retrieved by the retriever. Is used with ``question_encoder_last_hidden_state`` to
- compute the ``doc_scores``.
- retrieved_doc_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs)`, `optional`, returned when `output_retrieved=True`):
+ (see `past_key_values` input) to speed up sequential decoding.
+ doc_scores (`tf.Tensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embedding (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
+ retrieved_doc_embeds (`tf.Tensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
+ Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to
+ compute the `doc_scores`.
+ retrieved_doc_ids (`tf.Tensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
The indexes of the embedded documents retrieved by the retriever.
- context_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
+ context_input_ids (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
- context_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ context_attention_mask (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- question_encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ question_encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
model.
- question_enc_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ question_enc_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
- question_enc_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ question_enc_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_enc_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ generator_enc_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
- generator_enc_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_enc_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
- generator_enc_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ generator_enc_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
- generator_dec_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ generator_dec_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
- generator_dec_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
- sequence_length)`.
+ generator_dec_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
@@ -233,61 +225,62 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
model checkpoints.
Params:
- question_encoder_pretrained_model_name_or_path (:obj: `str`, `optional`):
+ question_encoder_pretrained_model_name_or_path (`str`, *optional*):
Information necessary to initiate the question encoder. Can be either:
- - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g.,
- ``bert-base-uncased``.
- - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g.,
- ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `pytorch index checkpoint file` (e.g, ``./pt_model/``). In this case,
- ``question_encoder_from_pt`` should be set to :obj:`True`.
+ - A string with the *shortcut name* of a pretrained model to load from cache or download, e.g.,
+ `bert-base-uncased`.
+ - A string with the *identifier name* of a pretrained model that was user-uploaded to our S3, e.g.,
+ `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
+ `question_encoder_from_pt` should be set to `True`.
- generator_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
+ generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
Information necessary to initiate the generator. Can be either:
- - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g.,
- ``t5-small``.
- - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g.,
- ``facebook/bart-base``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `pytorch checkpoint file` (e.g, ``./pt_model/``). In this case,
- ``generator_from_pt`` should be set to :obj:`True`.
+ - A string with the *shortcut name* of a pretrained model to load from cache or download, e.g.,
+ `t5-small`.
+ - A string with the *identifier name* of a pretrained model that was user-uploaded to our S3, e.g.,
+ `facebook/bart-base`.
+ - A path to a *directory* containing model weights saved using
+ [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case,
+ `generator_from_pt` should be set to `True`.
- model_args (remaining positional arguments, `optional`):
- All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
- retriever (:class:`~transformers.RagRetriever`, `optional`):
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+ retriever ([`RagRetriever`], *optional*):
The retriever to use.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- ``output_attentions=True``).
+ `output_attentions=True`).
- - To update the question_encoder configuration, use the prefix `question_encoder_` for each
+ - To update the question_encoder configuration, use the prefix *question_encoder_* for each
configuration parameter.
- - To update the generator configuration, use the prefix `generator_` for each configuration parameter.
+ - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
- To update the parent model configuration, do not use a prefix for each configuration parameter.
- Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
- Example::
+ Example:
- >>> from transformers import RagRetriever, TFRagModel
- >>> # initialize a RAG from two pretrained models.
- >>> model = TFRagModel.from_pretrained_question_encoder_generator('facebook/dpr-question_encoder-single-nq-base', 't5-small')
- >>> # alternatively, initialize from pytorch pretrained models can also be done
- >>> model = TFRagModel.from_pretrained_question_encoder_generator('facebook/dpr-question_encoder-single-nq-base', "facebook/bart-base", generator_from_pt=True, question_encoder_from_pt=True)
+ ```python
+ >>> from transformers import RagRetriever, TFRagModel
+ >>> # initialize a RAG from two pretrained models.
+ >>> model = TFRagModel.from_pretrained_question_encoder_generator('facebook/dpr-question_encoder-single-nq-base', 't5-small')
+ >>> # alternatively, initialize from pytorch pretrained models can also be done
+ >>> model = TFRagModel.from_pretrained_question_encoder_generator('facebook/dpr-question_encoder-single-nq-base', "facebook/bart-base", generator_from_pt=True, question_encoder_from_pt=True)
- >>> # saving model after fine-tuning
- >>> model.save_pretrained("./rag")
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./rag")
- >>> # load retriever
- >>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
- >>> # load fine-tuned model with retriever
- >>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
- """
+ >>> # load retriever
+ >>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
+ >>> # load fine-tuned model with retriever
+ >>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
+ ```"""
kwargs_question_encoder = {
argument[len("question_encoder_") :]: value
@@ -370,20 +363,20 @@ RAG_START_DOCSTRING = r"""
relevant context documents. The documents are then prepended to the input. Such contextualized inputs is passed to
the generator.
- The question encoder can be any `autoencoding` model, preferably :class:`~transformers.TFDPRQuestionEncoder`, and
- the generator can be any `seq2seq` model, preferably :class:`~transformers.TFBartForConditionalGeneration`.
+ The question encoder can be any *autoencoding* model, preferably [`TFDPRQuestionEncoder`], and
+ the generator can be any *seq2seq* model, preferably [`TFBartForConditionalGeneration`].
- The model can be initialized with a :class:`~transformers.RagRetriever` for end-to-end generation or used in
+ The model can be initialized with a [`RagRetriever`] for end-to-end generation or used in
combination with the outputs of a retriever in multiple steps---see examples for more details. The model is
- compatible any `autoencoding` model as the ``question_encoder`` and any `seq2seq` model with language model head as
- the ``generator``. It has been tested with :class:`~transformers.TFDPRQuestionEncoder` as the ``question_encoder``
- and :class:`~transformers.TFBartForConditionalGeneration` as the ``generator``.
+ compatible with any *autoencoding* model as the `question_encoder` and any *seq2seq* model with language model head as
+ the `generator`. It has been tested with [`TFDPRQuestionEncoder`] as the `question_encoder`
+ and [`TFBartForConditionalGeneration`] as the `generator`.
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a Tensorflow `tf.keras.Model `__
+ This model is also a Tensorflow [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)
subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to
general usage and behavior.
@@ -391,84 +384,81 @@ RAG_START_DOCSTRING = r"""
SavedModel format.
Args:
- config (:class:`~transformers.RagConfig`):
+ config ([`RagConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the model weights.
- question_encoder (:class:`transformers.TFPreTrainedModel`):
- An encoder model compatible with the faiss index encapsulated by the ``retriever``.
- generator (:class:`transformers.TFPreTrainedModel`):
+ [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+ question_encoder ([`TFPreTrainedModel`]):
+ An encoder model compatible with the faiss index encapsulated by the `retriever`.
+ generator ([`TFPreTrainedModel`]):
A seq2seq model used as the generator in the RAG architecture.
- retriever (:class:`~transformers.RagRetriever`):
+ retriever ([`RagRetriever`]):
A retriever class encapsulating a faiss index queried to obtain context documents for current inputs.
"""
RAG_FORWARD_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
- Indices of input sequence tokens in the vocabulary. :class:`~transformers.RagConfig`, used to initialize
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize
the model, specifies which generator to use, it also specifies a compatible generator tokenizer. Use that
tokenizer class to obtain the indices.
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_outputs (:obj:`tuple(tuple(tf.Tensor)`, `optional`)
- Tuple consists of (:obj:`generator_enc_last_hidden_state`, `optional`: :obj:`generator_enc_hidden_states`,
- `optional`: :obj:`generator_enc_attentions`). :obj:`generator_enc_last_hidden_state` of shape
- :obj:`(batch_size, n_docs * sequence_length, hidden_size)` is a sequence of hidden-states at the output of
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_outputs (`tuple(tuple(tf.Tensor))`, *optional*):
+ Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
+ *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape
+ `(batch_size, n_docs * sequence_length, hidden_size)` is a sequence of hidden-states at the output of
the last layer of the generator's encoder.
- Used by the (:class:`~transformers.TFRagModel`) model during decoding.
- decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Provide for generation tasks. `None` by default, construct as per instructions for the generator model
+ Used by the ([`TFRagModel`]) model during decoding.
+ decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Provide for generation tasks. `None` by default, construct as per instructions for the generator model
you're using with your RAG instance.
- decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- past_key_values (:obj:`tuple(tuple(tf.Tensor))`):
- Tuple consists of two elements: :obj:`encoder_outputs` of the RAG model (see :obj:`encoder_outputs`) and
- :obj:`past_key_values` of the underlying generator. Can be used to speed up decoding.
- :obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
+ past_key_values (`tuple(tuple(tf.Tensor))`):
+ Tuple consists of two elements: `encoder_outputs` of the RAG model (see `encoder_outputs`) and
+ `past_key_values` of the underlying generator. Can be used to speed up decoding.
+ `past_key_values` are used in the ([`RagTokenForGeneration`]) model during
decoding.
- doc_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
- :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
- :obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more
+ doc_scores (`tf.Tensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`. If the model is not initialized with a `retriever`,
+ `doc_scores` has to be provided to the forward pass. `doc_scores` can be computed via
+ `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more
information.
- context_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Input IDs post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ context_input_ids (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- If the model has is not initialized with a ``retriever`` :obj:`context_input_ids` has to be provided to the
- forward pass. :obj:`context_input_ids` are returned by :meth:`~transformers.RagRetriever.__call__`.
- context_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided to the
+ forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
+ context_attention_mask (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- If the model has is not initialized with a ``retriever`` :obj:`context_attention_mask` has to be provided
- to the forward pass. :obj:`context_attention_mask` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If the model is not initialized with a `retriever`, `context_attention_mask` has to be provided
+ to the forward pass. `context_attention_mask` are returned by
+ [`~RagRetriever.__call__`].
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- output_retrieved(:obj:`bool`, `optional`):
- Whether or not to return the :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`,
- :obj:`context_input_ids` and :obj:`context_attention_mask`. See returned tensors for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~TFRetrievAugLMOutput` instead of a plain tuple.
- n_docs (:obj:`int`, `optional`, defaults to :obj:`config.n_docs`)
+ output_retrieved (`bool`, *optional*):
+ Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`,
+ `context_input_ids` and `context_attention_mask`. See returned tensors for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`TFRetrievAugLMOutput`] instead of a plain tuple.
+ n_docs (`int`, *optional*, defaults to `config.n_docs`):
Number of documents to retrieve and/or number of documents for which to generate an answer.
"""
@@ -910,47 +900,48 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
**kwargs # needs kwargs for generation
):
r"""
- do_marginalize (:obj:`bool`, `optional`):
- If :obj:`True`, the logits are marginalized over all documents by making use of
- ``torch.nn.functional.log_softmax``.
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ do_marginalize (`bool`, *optional*):
+ If `True`, the logits are marginalized over all documents by making use of
+ `torch.nn.functional.log_softmax`.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss according to Rag-Token model formulation See
https://arxiv.org/pdf/2005.11401.pdf Section 2.1 for details about Rag-Token formulation. Indices should be
- in ``[0, ..., config.vocab_size - 1]``.
- reduce_loss (:obj:`bool`, `optional`):
- Only relevant if ``labels`` is passed. If :obj:`True`, the NLL loss is reduced using the ``tf.Tensor.sum``
+ in `[0, ..., config.vocab_size - 1]`.
+ reduce_loss (`bool`, *optional*):
+ Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
- Legacy dictionary, which is required so that model can use `generate()` function.
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
+ Legacy dictionary, which is required so that model can use `generate()` function.
Returns:
- Example::
+ Example:
- >>> from transformers import RagTokenizer, RagRetriever, TFRagTokenForGeneration
+ ```python
+ >>> from transformers import RagTokenizer, RagRetriever, TFRagTokenForGeneration
- >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
- >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
- >>> # initialize with RagRetriever to do everything in one forward call
- >>> model = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever, from_pt=True)
+ >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
+ >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+ >>> # initialize with RagRetriever to do everything in one forward call
+ >>> model = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever, from_pt=True)
- >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
- >>> outputs = model(input_dict, output_retrieved=True)
+ >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
+ >>> outputs = model(input_dict, output_retrieved=True)
- >>> # or use retriever separately
- >>> # 1. Encode
- >>> input_ids = input_dict["input_ids"]
- >>> question_hidden_states = model.question_encoder(input_ids)[0]
- >>> # 2. Retrieve
- >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
- >>> doc_scores = tf.squeeze(tf.matmul(tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True), axis=1)
- >>> # 3. Forward to generator
- >>> outputs = model(inputs=None, context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
+ >>> # or use retriever separately
+ >>> # 1. Encode
+ >>> input_ids = input_dict["input_ids"]
+ >>> question_hidden_states = model.question_encoder(input_ids)[0]
+ >>> # 2. Retrieve
+ >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
+ >>> doc_scores = tf.squeeze(tf.matmul(tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True), axis=1)
+ >>> # 3. Forward to generator
+ >>> outputs = model(inputs=None, context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
- >>> # or directly generate
- >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
- >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
- """
+ >>> # or directly generate
+ >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
+ >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
+ ```"""
assert (
"decoder_cached_states" not in kwargs
@@ -1073,91 +1064,89 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
Implements TFRAG token decoding.
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- The sequence used as a prompt for the generation. If :obj:`input_ids` is not passed, then
- :obj:`context_input_ids` has to be provided.
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ The sequence used as a prompt for the generation. If `input_ids` is not passed, then
+ `context_input_ids` has to be provided.
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- context_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Input IDs post-processed from the retrieved documents and the question encoder :obj:`input_ids` by the
+ [What are attention masks?](../glossary#attention-mask)
+ context_input_ids (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
retriever.
- If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
- to the forward pass. :obj:`context_input_ids` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- context_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided
+ to the forward pass. `context_input_ids` are returned by
+ [`~RagRetriever.__call__`].
+ context_attention_mask (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when `output_retrieved=True`):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by
the retriever.
- If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
- to the forward pass. :obj:`context_input_ids` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- doc_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`.
+ If the model is not initialized with a `retriever`, `context_attention_mask` has to be provided
+ to the forward pass. `context_attention_mask` are returned by
+ [`~RagRetriever.__call__`].
+ doc_scores (`tf.Tensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`.
- If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
- to the forward pass. :obj:`context_input_ids` are returned by
- :meth:`~transformers.RagRetriever.__call__`.
- max_length (:obj:`int`, `optional`, defaults to 20):
+ If the model is not initialized with a `retriever`, `context_input_ids` has to be provided
+ to the forward pass. `context_input_ids` are returned by
+ [`~RagRetriever.__call__`].
+ max_length (`int`, *optional*, defaults to 20):
The maximum length of the sequence to be generated.
- min_length (:obj:`int`, `optional`, defaults to 10):
+ min_length (`int`, *optional*, defaults to 10):
The minimum length of the sequence to be generated.
- early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to stop the beam search when at least ``num_beams`` sentences are finished per batch or
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether or not to stop the beam search when at least `num_beams` sentences are finished per batch or
not.
- use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should use the past last key/values attentions (if applicable to the model) to
speed up decoding.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- bos_token_id (:obj:`int`, `optional`):
- The id of the `beginning-of-sequence` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ bos_token_id (`int`, *optional*):
+ The id of the *beginning-of-sequence* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty.
Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
order to encourage the model to produce longer sequences.
- no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once.
- bad_words_ids(:obj:`List[int]`, `optional`):
+ bad_words_ids (`List[int]`, *optional*):
List of token ids that are not allowed to be generated. In order to get the tokens of the words that
- should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+ num_return_sequences (`int`, *optional*, defaults to 1):
The number of independently computed returned sequences for each element in the batch. Note that this
- is not the value we pass to the ``generator``'s
- `:func:`~transformers.generation_utils.GenerationMixin.generate` function, where we set
- ``num_return_sequences`` to :obj:`num_beams`.
- decoder_start_token_id (:obj:`int`, `optional`):
- If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
- n_docs (:obj:`int`, `optional`, defaults to :obj:`config.n_docs`)
+ is not the value we pass to the `generator`'s
+ [`~generation_utils.GenerationMixin.generate`] function, where we set
+ `num_return_sequences` to `num_beams`.
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ n_docs (`int`, *optional*, defaults to `config.n_docs`):
Number of documents to retrieve and/or number of documents for which to generate an answer.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
model_specific_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
+ Additional model specific kwargs will be forwarded to the `forward` function of the model.
Return:
- :obj:`tf.Tensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
- sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
- batches finished early due to the :obj:`eos_token_id`.
+ `tf.Tensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
+ sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all
+ batches finished early due to the `eos_token_id`.
"""
# set default parameters
n_docs = n_docs if n_docs is not None else self.config.n_docs
@@ -1503,47 +1492,48 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
**kwargs # needs kwargs for generation
):
r"""
- exclude_bos_score (:obj:`bool`, `optional`):
- Only relevant if ``labels`` is passed. If :obj:`True`, the score of the BOS token is disregarded when
+ exclude_bos_score (`bool`, *optional*):
+ Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when
computing the loss.
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss according to Rag-Sequence model formulation See
https://arxiv.org/pdf/2005.11401.pdf Section 2.1 for details about Rag-Sequence formulation. Indices should
- be in ``[0, ..., config.vocab_size - 1]``.
- reduce_loss (:obj:`bool`, `optional`):
- Only relevant if ``labels`` is passed. If :obj:`True`, the NLL loss is reduced using the ``tf.Tensor.sum``
+ be in `[0, ..., config.vocab_size - 1]`.
+ reduce_loss (`bool`, *optional*):
+ Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation.
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
- Legacy dictionary, which is required so that model can use `generate()` function.
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
+ Legacy dictionary, which is required so that model can use `generate()` function.
Returns:
- Example::
+ Example:
- >>> from transformers import RagTokenizer, RagRetriever, TFRagSequenceForGeneration
+ ```python
+ >>> from transformers import RagTokenizer, RagRetriever, TFRagSequenceForGeneration
- >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
- >>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
- >>> # initialize with RagRetriever to do everything in one forward call
- >>> model = TFRagRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever, from_pt=True)
+ >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+ >>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
+ >>> # initialize with RagRetriever to do everything in one forward call
+ >>> model = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever, from_pt=True)
- >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
- >>> outputs = model(input_dict, output_retrieved=True)
+ >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
+ >>> outputs = model(input_dict, output_retrieved=True)
- >>> # or use retriever separately
- >>> # 1. Encode
- >>> input_ids = input_dict["input_ids"]
- >>> question_hidden_states = model.question_encoder(input_ids)[0]
- >>> # 2. Retrieve
- >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
- >>> doc_scores = tf.squeeze(tf.matmul(tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True), axis=1)
- >>> # 3. Forward to generator
- >>> outputs = model(inputs=None, context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
+ >>> # or use retriever separately
+ >>> # 1. Encode
+ >>> input_ids = input_dict["input_ids"]
+ >>> question_hidden_states = model.question_encoder(input_ids)[0]
+ >>> # 2. Retrieve
+ >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
+ >>> doc_scores = tf.squeeze(tf.matmul(tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True), axis=1)
+ >>> # 3. Forward to generator
+ >>> outputs = model(inputs=None, context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
- >>> # or directly generate
- >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
- >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
- """
+ >>> # or directly generate
+ >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
+ >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
+ ```"""
assert (
"decoder_cached_states" not in kwargs
@@ -1728,49 +1718,48 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
):
"""
Implements RAG sequence "thorough" decoding. Read the
- :meth:`~transformers.generation_utils.GenerationMixin.generate`` documentation for more information on how to
+ [`~generation_utils.GenerationMixin.generate`] documentation for more information on how to
set other generate input parameters
Args:
- input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- The sequence used as a prompt for the generation. If :obj:`input_ids` is not passed, then
- :obj:`context_input_ids` has to be provided.
- attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1
- for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks?
- <../glossary.html#attention-mask>`__
- context_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
+ input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ The sequence used as a prompt for the generation. If `input_ids` is not passed, then
+ `context_input_ids` has to be provided.
+ attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1
+ for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask)
+ context_input_ids (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
retriever.
- context_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
- Attention mask post-processed from the retrieved documents and the question encoder :obj:`input_ids` by
- the retriever. If the model has is not initialized with a ``retriever`` or ``input_ids`` is not given,
- :obj:`context_input_ids` and :obj:`context_attention_mask` have to be provided to the forward pass.
- They are returned by :meth:`~transformers.RagRetriever.__call__`.
- doc_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
- :obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever`` or
- ``input_ids`` is not given, :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores`
- are returned by :meth:`~transformers.RagRetriever.__call__`.
- do_deduplication (:obj:`bool`, `optional`):
+ context_attention_mask (`tf.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
+ Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by
+ the retriever. If the model is not initialized with a `retriever` or `input_ids` is not given,
+ `context_input_ids` and `context_attention_mask` have to be provided to the forward pass.
+ They are returned by [`~RagRetriever.__call__`].
+ doc_scores (`tf.Tensor` of shape `(batch_size, config.n_docs)`):
+ Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
+ `question_encoder_last_hidden_state`. If the model is not initialized with a `retriever` or
+ `input_ids` is not given, `doc_scores` has to be provided to the forward pass. `doc_scores`
+ are returned by [`~RagRetriever.__call__`].
+ do_deduplication (`bool`, *optional*):
Whether or not to deduplicate the generations from different context documents for a given input. Has
- to be set to :obj:`False` if used while training with distributed backend.
- num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+ to be set to `False` if used while training with distributed backend.
+ num_return_sequences(`int`, *optional*, defaults to 1):
The number of independently computed returned sequences for each element in the batch. Note that this
- is not the value we pass to the ``generator``'s
- `:func:`~transformers.generation_utils.GenerationMixin.generate`` function, where we set
- ``num_return_sequences`` to :obj:`num_beams`.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ is not the value we pass to the `generator`'s
+ [`~generation_utils.GenerationMixin.generate`] function, where we set
+ `num_return_sequences` to `num_beams`.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- n_docs (:obj:`int`, `optional`, defaults to :obj:`config.n_docs`)
+ n_docs (`int`, *optional*, defaults to `config.n_docs`):
Number of documents to retrieve and/or number of documents for which to generate an answer.
kwargs:
- Additional kwargs will be passed to :meth:`~transformers.generation_utils.GenerationMixin.generate`
+ Additional kwargs will be passed to [`~generation_utils.GenerationMixin.generate`]
Return:
- :obj:`tf.Tensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
- sequences. The second dimension (sequence length) is either equal to :obj:`max_length` or shorter if all
- batches finished early due to the :obj:`eos_token_id`.
+ `tf.Tensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
+ sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all
+ batches finished early due to the `eos_token_id`.
"""
n_docs = n_docs if n_docs is not None else self.config.n_docs
diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py
index eae0b30092..71fef5a7c7 100755
--- a/src/transformers/models/reformer/modeling_reformer.py
+++ b/src/transformers/models/reformer/modeling_reformer.py
@@ -1803,30 +1803,28 @@ class ReformerPreTrainedModel(PreTrainedModel):
@dataclass
class ReformerModelOutput(ModelOutput):
"""
- Output type of :class:`~transformers.ReformerModel`.
+ Output type of [`ReformerModel`].
Args:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, hidden_size)`):
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
- ``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
- ``num_predict`` corresponds to ``sequence_length``.
- past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`Tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with the first
- element being the previous `buckets` of shape :obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
- and the second being the previous `hidden_states` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`).
+ `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then
+ `num_predict` corresponds to `sequence_length`.
+ past_buckets_states (`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `Tuple(torch.LongTensor, torch.FloatTensor)` of length `config.n_layers`, with the first
+ element being the previous *buckets* of shape `(batch_size, num_heads, num_hashes, sequence_length)`
+ and the second being the previous *hidden_states* of shape `(batch_size, sequence_length, hidden_size)`.
- Contains precomputed buckets and hidden-states that can be used (see ``past_buckets_states`` input) to
+ Contains precomputed buckets and hidden-states that can be used (see `past_buckets_states` input) to
speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -1841,32 +1839,30 @@ class ReformerModelOutput(ModelOutput):
@dataclass
class ReformerModelWithLMHeadOutput(ModelOutput):
"""
- Output type of :class:`~transformers.ReformerModelWithLMHead`.
+ Output type of [`ReformerModelWithLMHead`].
Args:
- loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
- logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`):
+ logits (`torch.FloatTensor` of shape `(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- ``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
- ``num_predict`` corresponds to ``sequence_length``.
- past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`Tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with the first
- element being the previous `buckets` of shape :obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
- and the second being the previous `hidden_states` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`).
+ `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then
+ `num_predict` corresponds to `sequence_length`.
+ past_buckets_states (`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ List of `Tuple(torch.LongTensor, torch.FloatTensor)` of length `config.n_layers`, with the first
+ element being the previous *buckets* of shape `(batch_size, num_heads, num_hashes, sequence_length)`
+ and the second being the previous *hidden_states* of shape `(batch_size, sequence_length, hidden_size)`.
- Contains precomputed buckets and hidden-states that can be used (see ``past_buckets_states`` input) to
+ Contains precomputed buckets and hidden-states that can be used (see `past_buckets_states` input) to
speed up sequential decoding.
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- TTuple of :obj:`torch.FloatTensor` (one for the output of the embeddings and one for the output of each
- layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each
+ layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
@@ -1880,82 +1876,80 @@ class ReformerModelWithLMHeadOutput(ModelOutput):
REFORMER_START_DOCSTRING = r"""
- Reformer was proposed in `Reformer: The Efficient Transformer `__ by Nikita
+ Reformer was proposed in [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita
Kitaev, Łukasz Kaiser, Anselm Levskaya.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.ReformerConfig`): Model configuration class with all the parameters of the model.
+ config ([`ReformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
REFORMER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. During training the input_ids sequence_length has to be
a multiple of the relevant model's chunk lengths (lsh's, local's or both). During evaluation, the indices
are automatically padded to be a multiple of the chunk length.
- Indices can be obtained using :class:`~transformers.ReformerTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`ReformerTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- num_hashes (:obj:`int`, `optional`):
+ num_hashes (`int`, *optional*):
The number of hashing rounds that should be performed during bucketing. Setting this argument overwrites
- the default defined in :obj:`config.num_hashes`.
+ the default defined in `config.num_hashes`.
- For more information, see :obj:`num_hashes` in :class:`~transformers.ReformerConfig`.
- past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`):
- List of :obj:`Tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with the first
- element being the previous `buckets` of shape :obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
- and the second being the previous `hidden_states` of shape :obj:`(batch_size, sequence_length,
- hidden_size)`).
+ For more information, see `num_hashes` in [`ReformerConfig`].
+ past_buckets_states (`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, *optional*):
+ List of `Tuple(torch.LongTensor, torch.FloatTensor)` of length `config.n_layers`, with the first
+ element being the previous *buckets* of shape `(batch_size, num_heads, num_hashes, sequence_length)`
+ and the second being the previous *hidden_states* of shape `(batch_size, sequence_length, hidden_size)`.
Contains precomputed hidden-states and buckets (only relevant for LSH Self-Attention). Can be used to speed
up sequential decoding.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -2221,10 +2215,9 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0,
- ..., config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only
- computed for labels in ``[0, ..., config.vocab_size]``
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only
+ computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2335,9 +2328,8 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2417,10 +2409,9 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -2537,13 +2528,13 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py
index 5b31c32b22..c6f898322f 100755
--- a/src/transformers/models/rembert/modeling_rembert.py
+++ b/src/transformers/models/rembert/modeling_rembert.py
@@ -676,65 +676,63 @@ class RemBertPreTrainedModel(PreTrainedModel):
REMBERT_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.RemBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`RemBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
REMBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.RemBertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`RemBertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -746,13 +744,13 @@ class RemBertModel(RemBertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
"""
@@ -806,23 +804,23 @@ class RemBertModel(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -958,10 +956,9 @@ class RemBertForMaskedLM(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1058,45 +1055,46 @@ class RemBertForCausalLM(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import RemBertTokenizer, RemBertForCausalLM, RemBertConfig
- >>> import torch
+ ```python
+ >>> from transformers import RemBertTokenizer, RemBertForCausalLM, RemBertConfig
+ >>> import torch
- >>> tokenizer = RemBertTokenizer.from_pretrained('google/rembert')
- >>> config = RemBertConfig.from_pretrained("google/rembert")
- >>> config.is_decoder = True
- >>> model = RemBertForCausalLM.from_pretrained('google/rembert', config=config)
+ >>> tokenizer = RemBertTokenizer.from_pretrained('google/rembert')
+ >>> config = RemBertConfig.from_pretrained("google/rembert")
+ >>> config.is_decoder = True
+ >>> model = RemBertForCausalLM.from_pretrained('google/rembert', config=config)
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.rembert(
@@ -1200,10 +1198,9 @@ class RemBertForSequenceClassification(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1297,10 +1294,9 @@ class RemBertForMultipleChoice(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1390,9 +1386,8 @@ class RemBertForTokenClassification(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1480,13 +1475,13 @@ class RemBertForQuestionAnswering(RemBertPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py
index 55b8c0c184..ba29b4bf9f 100644
--- a/src/transformers/models/rembert/modeling_tf_rembert.py
+++ b/src/transformers/models/rembert/modeling_tf_rembert.py
@@ -119,7 +119,7 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -836,7 +836,7 @@ class TFRemBertPreTrainedModel(TFPreTrainedModel):
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
dummy = {"input_ids": tf.constant(DUMMY_INPUTS)}
# Add `encoder_hidden_states` to make the cross-attention layers' weights initialized
@@ -851,92 +851,92 @@ class TFRemBertPreTrainedModel(TFPreTrainedModel):
REMBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ </Tip>
Args:
- config (:class:`~transformers.RemBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`RemBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
REMBERT_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`):
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`RemBertTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -978,24 +978,24 @@ class TFRemBertModel(TFRemBertPreTrainedModel):
**kwargs,
) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
"""
inputs = input_processing(
func=self.call,
@@ -1097,10 +1097,9 @@ class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLos
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1209,27 +1208,26 @@ class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLos
**kwargs,
) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1348,10 +1346,9 @@ class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceCla
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1454,10 +1451,9 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss)
**kwargs,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1598,9 +1594,8 @@ class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassific
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1695,13 +1690,13 @@ class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnswerin
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
- start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py
index 2456545a22..8684177951 100644
--- a/src/transformers/models/retribert/modeling_retribert.py
+++ b/src/transformers/models/retribert/modeling_retribert.py
@@ -66,18 +66,18 @@ class RetriBertPreTrainedModel(PreTrainedModel):
RETRIBERT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.RetriBertConfig`): Model configuration class with all the parameters of the model.
+ config ([`RetriBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
@@ -179,32 +179,32 @@ class RetriBertModel(RetriBertPreTrainedModel):
):
r"""
Args:
- input_ids_query (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary for the queries in a batch.
- Indices can be obtained using :class:`~transformers.RetriBertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`RetriBertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask_query (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- input_ids_doc (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ [What are attention masks?](../glossary#attention-mask)
+ input_ids_doc (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary for the documents in a batch.
- attention_mask_doc (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ attention_mask_doc (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on documents padding token indices.
- checkpoint_batch_size (:obj:`int`, `optional`, defaults to `:obj:`-1`):
+ checkpoint_batch_size (`int`, *optional*, defaults to `-1`):
If greater than 0, uses gradient checkpointing to only compute sequence representation on
- :obj:`checkpoint_batch_size` examples at a time on the GPU. All query representations are still
+ `checkpoint_batch_size` examples at a time on the GPU. All query representations are still
compared to all document representations in the batch.
Return:
- :obj:`torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
+ `torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
its corresponding document and each document to its corresponding query in the batch
"""
device = input_ids_query.device
diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py
index 8c764c85ad..475edb0b6d 100644
--- a/src/transformers/models/roberta/modeling_flax_roberta.py
+++ b/src/transformers/models/roberta/modeling_flax_roberta.py
@@ -72,64 +72,60 @@ def create_position_ids_from_input_ids(input_ids, padding_idx):
ROBERTA_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading, saving and converting weights from
PyTorch models)
- This model is also a Flax Linen `flax.linen.Module
- `__ subclass. Use it as a regular Flax linen Module
+ This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- - `Just-In-Time (JIT) compilation `__
- - `Automatic Differentiation `__
- - `Vectorization `__
- - `Parallelization `__
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
- config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
+ config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the
model weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+ input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`RobertaTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- head_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+ head_mask (`numpy.ndarray` of shape `({0})`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index 0ed738cd6b..b9f3b082f2 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -624,69 +624,67 @@ class RobertaPreTrainedModel(PreTrainedModel):
ROBERTA_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
+ config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`RobertaTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -698,16 +696,16 @@ class RobertaModel(RobertaPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
- .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
@@ -765,24 +763,24 @@ class RobertaModel(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -927,47 +925,48 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig
- >>> import torch
+ ```python
+ >>> from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig
+ >>> import torch
- >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
- >>> config = RobertaConfig.from_pretrained("roberta-base")
- >>> config.is_decoder = True
- >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config)
+ >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+ >>> config = RobertaConfig.from_pretrained("roberta-base")
+ >>> config.is_decoder = True
+ >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config)
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
@@ -1085,11 +1084,10 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1196,10 +1194,9 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1293,10 +1290,9 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1391,9 +1387,8 @@ class RobertaForTokenClassification(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1505,13 +1500,13 @@ class RobertaForQuestionAnswering(RobertaPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py
index bb5defef42..bc62c636ba 100644
--- a/src/transformers/models/roberta/modeling_tf_roberta.py
+++ b/src/transformers/models/roberta/modeling_tf_roberta.py
@@ -141,7 +141,7 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
Applies embedding based on inputs tensor.
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -801,7 +801,7 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel):
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
dummy = {"input_ids": tf.constant(DUMMY_INPUTS)}
# Add `encoder_hidden_states` to make the cross-attention layers' weights initialized
@@ -829,92 +829,92 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel):
ROBERTA_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model `__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+
Parameters:
- config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
+ config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`RobertaTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
- `What are position IDs? <../glossary.html#position-ids>`__
- head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -955,24 +955,24 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
**kwargs,
):
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
"""
inputs = input_processing(
func=self.call,
@@ -1127,10 +1127,9 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
inputs = input_processing(
func=self.call,
@@ -1245,27 +1244,26 @@ class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLos
**kwargs,
) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
r"""
- encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`)
+ past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`). Set to `False` during training, `True` during generation
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1411,10 +1409,9 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1522,10 +1519,9 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1660,9 +1656,8 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
**kwargs,
):
r"""
- labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1762,13 +1757,13 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
**kwargs,
):
r"""
- start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py
index 4f3188111b..95937e5271 100644
--- a/src/transformers/models/roformer/modeling_roformer.py
+++ b/src/transformers/models/roformer/modeling_roformer.py
@@ -732,60 +732,59 @@ class RoFormerPreTrainedModel(PreTrainedModel):
ROFORMER_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.RoFormerConfig`): Model configuration class with all the parameters of the model.
+ config ([`RoFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ROFORMER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+ input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.RoFormerTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`RoFormerTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -797,13 +796,13 @@ class RoFormerModel(RoFormerPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
- all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
- To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
- set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
input to the forward pass.
"""
@@ -857,23 +856,23 @@ class RoFormerModel(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1005,10 +1004,9 @@ class RoFormerForMaskedLM(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1104,45 +1102,46 @@ class RoFormerForCausalLM(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
Returns:
- Example::
+ Example:
- >>> from transformers import RoFormerTokenizer, RoFormerForCausalLM, RoFormerConfig
- >>> import torch
+ ```python
+ >>> from transformers import RoFormerTokenizer, RoFormerForCausalLM, RoFormerConfig
+ >>> import torch
- >>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
- >>> config = RoFormerConfig.from_pretrained("junnyu/roformer_chinese_base")
- >>> config.is_decoder = True
- >>> model = RoFormerForCausalLM.from_pretrained('junnyu/roformer_chinese_base', config=config)
+ >>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
+ >>> config = RoFormerConfig.from_pretrained("junnyu/roformer_chinese_base")
+ >>> config.is_decoder = True
+ >>> model = RoFormerForCausalLM.from_pretrained('junnyu/roformer_chinese_base', config=config)
- >>> inputs = tokenizer("今天天气非常好。", return_tensors="pt")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("今天天气非常好。", return_tensors="pt")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
- """
+ >>> prediction_logits = outputs.logits
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roformer(
@@ -1264,10 +1263,9 @@ class RoFormerForSequenceClassification(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss);
+ if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1359,10 +1357,9 @@ class RoFormerForMultipleChoice(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1450,9 +1447,8 @@ class RoFormerForTokenClassification(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1539,13 +1535,13 @@ class RoFormerForQuestionAnswering(RoFormerPreTrainedModel):
return_dict=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py
index 08a7e2bc24..66b36e4f70 100644
--- a/src/transformers/models/roformer/modeling_tf_roformer.py
+++ b/src/transformers/models/roformer/modeling_tf_roformer.py
@@ -172,7 +172,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
Returns:
- final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+ final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
@@ -715,87 +715,88 @@ class TFRoFormerPreTrainedModel(TFPreTrainedModel):
ROFORMER_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.)
- This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior.
- .. note::
+ <Tip>
- TF 2.0 models accepts two formats as inputs:
+ TF 2.0 models accept two formats as inputs:
- - having all inputs as keyword arguments (like PyTorch models), or
- - having all inputs as a list, tuple or dict in the first positional arguments.
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
- the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+ This second option is useful when using the [`tf.keras.Model.fit`] method which currently requires having all
+ the tensors in the first argument of the model call function: `model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
- the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument:
- - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
- - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associated to the input names given in the docstring:
- :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ </Tip>
Args:
- config (:class:`~transformers.RoFormerConfig`): Model configuration class with all the parameters of the model.
+ config ([`RoFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
ROFORMER_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`):
+ input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.RoFormerTokenizer`. See
- :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ Indices can be obtained using [`RoFormerTokenizer`]. See
+ [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
- 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- - 0 corresponds to a `sentence A` token,
- - 1 corresponds to a `sentence B` token.
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
- Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+ [What are token type IDs?](../glossary#token-type-ids)
+ head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
argument can be used in eager mode, in graph mode the value will always be set to True.
- training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@@ -905,10 +906,9 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL
**kwargs,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
inputs = input_processing(
func=self.call,
@@ -997,9 +997,8 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL
**kwargs,
) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., config.vocab_size - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1122,10 +1121,9 @@ class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceC
**kwargs,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss);
+ if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
inputs = input_processing(
func=self.call,
@@ -1227,10 +1225,9 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
**kwargs,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
- num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
- :obj:`input_ids` above)
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
"""
inputs = input_processing(
func=self.call,
@@ -1363,9 +1360,8 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
**kwargs,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
- labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
- 1]``.
+ labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
inputs = input_processing(
func=self.call,
@@ -1457,13 +1453,13 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
- start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
- end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
inputs = input_processing(
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 7cfbe0ceb7..bdbc577d43 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -426,33 +426,33 @@ class SegformerPreTrainedModel(PreTrainedModel):
SEGFORMER_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.SegformerConfig`): Model configuration class with all the parameters of the model.
+ config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
SEGFORMER_INPUTS_DOCSTRING = r"""
Args:
- pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- :class:`~transformers.SegformerFeatureExtractor`. See
- :meth:`transformers.SegformerFeatureExtractor.__call__` for details.
+ [`SegformerFeatureExtractor`]. See
+ [`SegformerFeatureExtractor.__call__`] for details.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -557,32 +557,32 @@ class SegformerForImageClassification(SegformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss);
+ if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import SegformerFeatureExtractor, SegformerForImageClassification
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import SegformerFeatureExtractor, SegformerForImageClassification
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = SegformerFeatureExtractor.from_pretrained('nvidia/mit-b0')
- >>> model = SegformerForImageClassification.from_pretrained('nvidia/mit-b0')
+ >>> feature_extractor = SegformerFeatureExtractor.from_pretrained('nvidia/mit-b0')
+ >>> model = SegformerForImageClassification.from_pretrained('nvidia/mit-b0')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.segformer(
@@ -713,29 +713,29 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, height, width)`, `optional`):
- Ground truth semantic segmentation maps for computing the loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels > 1`, a classification loss is computed
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed
(Cross-Entropy).
Returns:
- Examples::
+ Examples:
- >>> from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
+ >>> from PIL import Image
+ >>> import requests
- >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
- >>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+ >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+ >>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
index 92ac0d1e0f..dada447525 100644
--- a/src/transformers/models/sew/modeling_sew.py
+++ b/src/transformers/models/sew/modeling_sew.py
@@ -744,50 +744,48 @@ class SEWPreTrainedModel(PreTrainedModel):
SEW_START_DOCSTRING = r"""
- SEW was proposed in `Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition
- `__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav
+ SEW was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav
Artzi.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving etc.).
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.SEWConfig`): Model configuration class with all the parameters of the model.
+ config ([`SEWConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
SEW_INPUTS_DOCSTRING = r"""
Args:
- input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
- Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
- into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
- soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
- be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
- :meth:`transformers.Wav2Vec2Processor.__call__` for details.
- attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
- 1]``:
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+ into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+ soundfile*). To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should
+ be used for padding and conversion into a tensor of type *torch.FloatTensor*. See
+ [`Wav2Vec2Processor.__call__`] for details.
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -966,11 +964,9 @@ class SEWForCTC(SEWPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_length)`, `optional`):
- Labels for connectionist temporal classification. Note that ``target_length`` has to be smaller or equal to
- the sequence length of the output logits. Indices are selected in ``[-100, 0, ..., config.vocab_size -
- 1]``. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1084,10 +1080,9 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index 27971624cf..f6bb3d0b1c 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -180,9 +180,9 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
"""
Build relative position according to the query and key
- We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key
- :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} =
- P_q - P_k`
+ We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
+ \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+ P_q - P_k\\)
Args:
query_size (int): the length of query
@@ -191,7 +191,7 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
max_position (int): the maximum allowed absolute position
Return:
- :obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+ `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
"""
q_ids = np.arange(0, query_size)
@@ -468,23 +468,24 @@ class XSoftmax(torch.autograd.Function):
Masked Softmax which is optimized for saving memory
Args:
- input (:obj:`torch.tensor`): The input tensor that will apply softmax.
- mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ input (`torch.tensor`): The input tensor that will apply softmax.
+ mask (`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
- Example::
+ Example:
- >>> import torch
- >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
+ ```python
+ >>> import torch
+ >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
- >>> # Make a tensor
- >>> x = torch.randn([4,20,100])
+ >>> # Make a tensor
+ >>> x = torch.randn([4,20,100])
- >>> # Create a mask
- >>> mask = (x>0).int()
+ >>> # Create a mask
+ >>> mask = (x>0).int()
- >>> y = XSoftmax.apply(x, mask, dim=-1)
- """
+ >>> y = XSoftmax.apply(x, mask, dim=-1)
+ ```"""
@staticmethod
def forward(self, input, mask, dim):
@@ -571,7 +572,7 @@ class StableDropout(nn.Module):
Call the module
Args:
- x (:obj:`torch.tensor`): The input tensor to apply dropout
+ x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context())
@@ -622,9 +623,9 @@ class DisentangledSelfAttention(nn.Module):
Disentangled self-attention module
Parameters:
- config (:obj:`DebertaV2Config`):
+ config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
- `BertConfig`, for more details, please refer :class:`~transformers.DebertaV2Config`
+ *BertConfig*, for more details, please refer to [`DebertaV2Config`]
"""
@@ -684,28 +685,28 @@ class DisentangledSelfAttention(nn.Module):
Call the module
Args:
- hidden_states (:obj:`torch.FloatTensor`):
+ hidden_states (`torch.FloatTensor`):
Input states to the module usually the output from previous layer, it will be the Q,K and V in
- `Attention(Q,K,V)`
+ *Attention(Q,K,V)*
- attention_mask (:obj:`torch.ByteTensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
- sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
+ attention_mask (`torch.ByteTensor`):
+ An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+ sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
- output_attentions (:obj:`bool`, optional):
+ output_attentions (`bool`, optional):
Whether return the attention matrix.
- query_states (:obj:`torch.FloatTensor`, optional):
- The `Q` state in `Attention(Q,K,V)`.
+ query_states (`torch.FloatTensor`, optional):
+ The *Q* state in *Attention(Q,K,V)*.
- relative_pos (:obj:`torch.LongTensor`):
- The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with
- values ranging in [`-max_relative_positions`, `max_relative_positions`].
+ relative_pos (`torch.LongTensor`):
+ The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+ values ranging in [*-max_relative_positions*, *max_relative_positions*].
- rel_embeddings (:obj:`torch.FloatTensor`):
- The embedding of relative distances. It's a tensor of shape [:math:`2 \\times
- \\text{max_relative_positions}`, `hidden_size`].
+ rel_embeddings (`torch.FloatTensor`):
+ The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+ \\text{max_relative_positions}\\), *hidden_size*].
"""
@@ -1274,50 +1275,48 @@ class SEWDPreTrainedModel(PreTrainedModel):
SEWD_START_DOCSTRING = r"""
- SEW-D was proposed in `Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition
- `__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav
+ SEW-D was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav
Artzi.
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving etc.).
- This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
- config (:class:`~transformers.SEWDConfig`): Model configuration class with all the parameters of the model.
+ config ([`SEWDConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
SEWD_INPUTS_DOCSTRING = r"""
Args:
- input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
- Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
- into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
- soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
- be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
- :meth:`transformers.Wav2Vec2Processor.__call__` for details.
- attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
- 1]``:
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+ into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+ soundfile*). To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should
+ be used for padding and conversion into a tensor of type *torch.FloatTensor*. See
+ [`Wav2Vec2Processor.__call__`] for details.
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@@ -1497,11 +1496,9 @@ class SEWDForCTC(SEWDPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_length)`, `optional`):
- Labels for connectionist temporal classification. Note that ``target_length`` has to be smaller or equal to
- the sequence length of the output logits. Indices are selected in ``[-100, 0, ..., config.vocab_size -
- 1]``. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ...,
- config.vocab_size - 1]``.
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1615,10 +1612,9 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index 87041f5e24..8523d6ef81 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -38,116 +38,112 @@ _CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig"
SPEECH_ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech
autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is
- loaded via :meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
- :meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
+ loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via
+ [`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added
to the decoder and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
- tasks was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
- `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+ tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
- Additionally, in `Large-Scale Self- and Semi-Supervised Learning for Speech Translation
- `__ it is shown how leveraging large pretrained speech models for speech
+ Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech
translation yields a significant performance improvement.
After such an Speech-Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other
models (see the examples for more information).
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.SpeechEncoderDecoderConfig`): Model configuration class with all the parameters of the model.
+ config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
- configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
weights.
"""
SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
- input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
- into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
- soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
- be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
- :meth:`transformers.Wav2Vec2Processor.__call__` for details.
- input_features (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length, feature_size)`, `optional`):
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+ into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+ soundfile*). To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should
+ be used for padding and conversion into a tensor of type *torch.FloatTensor*. See
+ [`Wav2Vec2Processor.__call__`] for details.
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`, *optional*):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
- by loading a ``.flac`` or ``.wav`` audio file into an array of type :obj:`List[float]` or a
- :obj:`numpy.ndarray`, *e.g.* via the soundfile library (``pip install soundfile``). To prepare the array
- into :obj:`input_features`, the :class:`~transformers.Speech2TextTokenizer` should be used for extracting
- the fbank features, padding and conversion into a tensor of type :obj:`torch.FloatTensor`. See
- :meth:`~transformers.Speech2TextTokenizer.__call__`
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+ `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array
+ into `input_features`, the [`Speech2TextTokenizer`] should be used for extracting
+ the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. See
+ [`~Speech2TextTokenizer.__call__`]
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
- For training, :obj:`decoder_input_ids` are automatically created by the model by shifting the :obj:`labels`
- to the right, replacing -100 by the :obj:`pad_token_id` and prepending them with the
- :obj:`decoder_start_token_id`.
- decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ For training, `decoder_input_ids` are automatically created by the model by shifting the `labels`
+ to the right, replacing -100 by the `pad_token_id` and prepending them with the
+ `decoder_start_token_id`.
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
- encoder_outputs (:obj:`tuple(torch.FloatTensor)`, `optional`):
- This tuple must consist of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
- sequence_length, hidden_size)`) is a tensor of hidden-states at the output of the last layer of the
+ encoder_outputs (`tuple(torch.FloatTensor)`, *optional*):
+ This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`) `last_hidden_state` (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) is a tensor of hidden-states at the output of the last layer of the
encoder. Used in the cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`decoder_input_ids`
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `decoder_input_ids`
indices into associated vectors than the model's internal embedding lookup matrix.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0,
- ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- If set to ``True``, the model will return a :class:`~transformers.file_utils.Seq2SeqLMOutput` instead of a
+ return_dict (`bool`, *optional*):
+ If set to `True`, the model will return a [`~file_utils.Seq2SeqLMOutput`] instead of a
plain tuple.
- kwargs: (`optional`) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
+ kwargs: (*optional*) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
- - Without a prefix which will be input as ``**encoder_kwargs`` for the encoder forward function.
- - With a `decoder_` prefix which will be input as ``**decoder_kwargs`` for the decoder forward function.
+ - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
+ - With a `decoder_` prefix which will be input as `**decoder_kwargs` for the decoder forward function.
"""
@@ -173,10 +169,10 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start
@add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING)
class SpeechEncoderDecoderModel(PreTrainedModel):
r"""
- :class:`~transformers.SpeechEncoderDecoderModel` is a generic model class that will be instantiated as a
+ [`SpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a
transformer architecture with one of the base model classes of the library as encoder and another one as decoder
- when created with the :meth`~transformers.AutoModel.from_pretrained` class method for the encoder and
- :meth`~transformers.AutoModelForCausalLM.from_pretrained` class method for the decoder.
+ when created with the [`~AutoModel.from_pretrained`] class method for the encoder and
+ [`~AutoModelForCausalLM.from_pretrained`] class method for the decoder.
"""
config_class = SpeechEncoderDecoderConfig
base_model_prefix = "speech_encoder_decoder"
@@ -283,60 +279,60 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
checkpoints.
- The model is set in evaluation mode by default using :obj:`model.eval()` (Dropout modules are deactivated). To
- train the model, you need to first set it back in training mode with :obj:`model.train()`.
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
+ train the model, you need to first set it back in training mode with `model.train()`.
Params:
- encoder_pretrained_model_name_or_path (:obj: `str`, `optional`):
+ encoder_pretrained_model_name_or_path (`str`, *optional*):
Information necessary to initiate the encoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
+ decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
Information necessary to initiate the decoder. Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- model_args (remaining positional arguments, `optional`):
- All remaning positional arguments will be passed to the underlying model's ``__init__`` method.
+ model_args (remaining positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`).
+ `output_attentions=True`).
- - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
- - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
+ - To update the encoder configuration, use the prefix `encoder_` for each configuration parameter.
+ - To update the decoder configuration, use the prefix `decoder_` for each configuration parameter.
- To update the parent model configuration, do not use a prefix for each configuration parameter.
- Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
+ Behaves differently depending on whether a `config` is provided or automatically loaded.
- Example::
+ Example:
- >>> from transformers import SpeechEncoderDecoderModel
- >>> # initialize a wav2vec2bert from a pretrained Wav2Vec2 and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
- >>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained('facebook/wav2vec2-base-960h', 'bert-base-uncased')
- >>> # saving model after fine-tuning
- >>> model.save_pretrained("./wav2vec2bert")
- >>> # load fine-tuned model
- >>> model = SpeechEncoderDecoderModel.from_pretrained("./wav2vec2bert")
-
- """
+ ```python
+ >>> from transformers import SpeechEncoderDecoderModel
+ >>> # initialize a wav2vec2bert from a pretrained Wav2Vec2 and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
+ >>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained('facebook/wav2vec2-base-960h', 'bert-base-uncased')
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./wav2vec2bert")
+ >>> # load fine-tuned model
+ >>> model = SpeechEncoderDecoderModel.from_pretrained("./wav2vec2bert")
+ ```"""
kwargs_encoder = {
argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index 4fade8ba7c..8fc2e83ed3 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -378,13 +378,13 @@ class Speech2TextEncoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- :obj:`(config.encoder_attention_heads,)`.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(config.encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -461,19 +461,19 @@ class Speech2TextDecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- :obj:`(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -586,117 +586,111 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
SPEECH_TO_TEXT_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.Speech2TextConfig`):
+ config ([`Speech2TextConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
Args:
- input_features (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length, feature_size)`):
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
- by loading a ``.flac`` or ``.wav`` audio file into an array of type :obj:`List[float]` or a
- :obj:`numpy.ndarray`, *e.g.* via the soundfile library (``pip install soundfile``). To prepare the array
- into :obj:`input_features`, the :class:`~transformers.Speech2TextTokenizer` should be used for extracting
- the fbank features, padding and conversion into a tensor of type :obj:`torch.FloatTensor`. See
- :meth:`~transformers.Speech2TextTokenizer.__call__`
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
- 1]``:
+ by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+ `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array
+ into `input_features`, the [`Speech2TextTokenizer`] should be used for extracting
+ the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. See
+ [`~Speech2TextTokenizer.__call__`]
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.SpeechToTextTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`Speech2TextTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
- SpeechToText uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
- :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
- :obj:`past_key_values`).
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+ SpeechToText uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
also be used by default.
If you want to change padding behavior, you should read
- :func:`modeling_speech_to_text._prepare_decoder_inputs` and modify to your needs. See diagram 1 in `the
- paper `__ for more information on the default strategy.
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
+ [`modeling_speech_to_text._prepare_decoder_inputs`] and modify to your needs. See diagram 1 in [the
+ paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
- Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
- :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
- `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+ `attentions`). `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`,
+ *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross-attention of the decoder.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
- of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
- shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+ of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
- blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
- representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
- have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
- :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
- If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
- takes the value of :obj:`inputs_embeds`.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
+ takes the value of `inputs_embeds`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class Speech2TextEncoder(Speech2TextPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
- :class:`Speech2TextEncoderLayer`.
+ [`Speech2TextEncoderLayer`].
Args:
config: Speech2TextConfig
@@ -739,35 +733,35 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
):
r"""
Args:
- input_features (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length, feature_size)`):
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
- obtained by loading a ``.flac`` or ``.wav`` audio file into an array of type :obj:`List[float]` or a
- :obj:`numpy.ndarray`, *e.g.* via the soundfile library (``pip install soundfile``). To prepare the
- array into :obj:`input_features`, the :class:`~transformers.Speech2TextTokenizer` should be used for
- extracting the fbank features, padding and conversion into a tensor of type :obj:`torch.FloatTensor`.
- See :meth:`~transformers.Speech2TextTokenizer.__call__`
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+ `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the
+ array into `input_features`, the [`Speech2TextTokenizer`] should be used for
+ extracting the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`.
+ See [`~Speech2TextTokenizer.__call__`]
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
- ``[0, 1]``:
+ `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -851,7 +845,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
class Speech2TextDecoder(Speech2TextPreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`Speech2TextDecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2TextDecoderLayer`].
Args:
config: Speech2TextConfig
@@ -923,71 +917,68 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.Speech2TextTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`Speech2TextTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
- on hidden heads. Mask values selected in ``[0, 1]``:
+ on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1297,36 +1288,36 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
return_dict=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
- Example::
+ Example:
- >>> import torch
- >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
- >>> from datasets import load_dataset
- >>> import soundfile as sf
+ ```python
+ >>> import torch
+ >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
+ >>> from datasets import load_dataset
+ >>> import soundfile as sf
- >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
- >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+ >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
+ >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
- >>> def map_to_array(batch):
- >>> speech, _ = sf.read(batch["file"])
- >>> batch["speech"] = speech
- >>> return batch
+ >>> def map_to_array(batch):
+ ...     speech, _ = sf.read(batch["file"])
+ ...     batch["speech"] = speech
+ ...     return batch
- >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- >>> ds = ds.map(map_to_array)
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ >>> ds = ds.map(map_to_array)
- >>> input_features = processor(ds["speech"][0], sampling_rate=16000, return_tensors="pt").input_features # Batch size 1
- >>> generated_ids = model.generate(input_ids=input_features)
+ >>> input_features = processor(ds["speech"][0], sampling_rate=16000, return_tensors="pt").input_features # Batch size 1
+ >>> generated_ids = model.generate(input_ids=input_features)
- >>> transcription = processor.batch_decode(generated_ids)
- """
+ >>> transcription = processor.batch_decode(generated_ids)
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
index 4aee66b534..bdcf90b65e 100755
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
@@ -336,19 +336,19 @@ class Speech2Text2DecoderLayer(nn.Module):
):
"""
Args:
- hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- attention_mask (:obj:`torch.FloatTensor`): attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape :obj:`(seq_len, batch, embed_dim)`
- encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
- :obj:`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size
- :obj:`(encoder_attention_heads,)`.
- cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of
- size `(decoder_attention_heads,)`.
- past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size `(decoder_attention_heads,)`.
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -433,25 +433,25 @@ class Speech2Text2PreTrainedModel(PreTrainedModel):
SPEECH_TO_TEXT_2_START_DOCSTRING = r"""
- This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
- This model is also a PyTorch `torch.nn.Module `__
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
- config (:class:`~transformers.Speech2Text2Config`):
+ config ([`Speech2Text2Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
- :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
"""
- Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`Speech2Text2DecoderLayer`
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2Text2DecoderLayer`]
Args:
config: Speech2Text2Config
@@ -521,71 +521,68 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.Speech2Text2Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`Speech2Text2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
- encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
- selected in ``[0, 1]``:
+ selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
- on hidden heads. Mask values selected in ``[0, 1]``:
+ on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last
- :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
- shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size,
- sequence_length)`.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
- Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
- representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+ If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size,
+ sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding lookup matrix.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -727,7 +724,7 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
- used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
+ used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
@@ -792,86 +789,85 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
):
r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
- Indices can be obtained using :class:`~transformers.Speech2Text2Tokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+ Indices can be obtained using [`Speech2Text2Tokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
for details.
- `What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- `What are attention masks? <../glossary.html#attention-mask>`__
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
- in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
- Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
- tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
- tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
- cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
decoding.
- If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
- instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
- ignored (masked), the loss is only computed for the tokens with labels in ``[0, ...,
- config.vocab_size]``.
- use_cache (:obj:`bool`, `optional`):
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
- decoding (see :obj:`past_key_values`).
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
+ decoding (see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
- output_attentions (:obj:`bool`, `optional`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
- return_dict (:obj:`bool`, `optional`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
- Example::
+ Example:
- >>> from transformers import SpeechEncoderDecoderModel, Speech2Text2ForCausalLM, Wav2Vec2Model, Speech2Text2Config, Wav2Vec2Config
+ ```python
+ >>> from transformers import SpeechEncoderDecoderModel, Speech2Text2ForCausalLM, Wav2Vec2Model, Speech2Text2Config, Wav2Vec2Config
- >>> encoder = Wav2Vec2Model(Wav2Vec2Config())
- >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config())
+ >>> encoder = Wav2Vec2Model(Wav2Vec2Config())
+ >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config())
- # init speech2text model
- >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
- """
+ # init speech2text model
+ >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index c88f0eab0b..5565e4c4b3 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -539,65 +539,63 @@ class SplinterPreTrainedModel(PreTrainedModel):
SPLINTER_START_DOCSTRING = r"""
- This model is a PyTorch `torch.nn.Module