From 27b3031de2fb8195dec9bc2093e3e70bdb1c4bff Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 21 Dec 2021 15:06:33 -0500
Subject: [PATCH] Mass conversion of documentation from rst to Markdown
(#14866)
* Convert docstrings of all configurations and tokenizers
* Processors and fixes
* Last modeling files and fixes to models
* Pipeline modules
* Utils files
* Data submodule
* All the other files
* Style
* Missing examples
* Style again
* Fix copies
* Say bye bye to rst docstrings forever
---
src/transformers/configuration_utils.py | 288 ++--
src/transformers/convert_slow_tokenizer.py | 14 +-
src/transformers/data/data_collator.py | 143 +-
src/transformers/data/processors/glue.py | 16 +-
src/transformers/data/processors/squad.py | 52 +-
src/transformers/data/processors/utils.py | 22 +-
src/transformers/debug_utils.py | 113 +-
src/transformers/deepspeed.py | 36 +-
.../feature_extraction_sequence_utils.py | 55 +-
src/transformers/feature_extraction_utils.py | 149 +-
src/transformers/generation_beam_search.py | 108 +-
.../generation_flax_logits_process.py | 66 +-
src/transformers/generation_flax_utils.py | 93 +-
src/transformers/generation_logits_process.py | 137 +-
.../generation_stopping_criteria.py | 30 +-
src/transformers/generation_tf_utils.py | 456 +++---
src/transformers/generation_utils.py | 1222 ++++++++---------
src/transformers/image_utils.py | 48 +-
src/transformers/integrations.py | 76 +-
src/transformers/keras_callbacks.py | 32 +-
src/transformers/modelcard.py | 44 +-
.../modeling_flax_pytorch_utils.py | 2 +-
src/transformers/modeling_flax_utils.py | 271 ++--
src/transformers/modeling_tf_utils.py | 372 ++---
src/transformers/modeling_utils.py | 591 ++++----
.../models/albert/configuration_albert.py | 95 +-
.../models/albert/modeling_flax_albert.py | 18 +-
.../models/albert/modeling_tf_albert.py | 21 +-
.../models/albert/tokenization_albert.py | 103 +-
.../models/albert/tokenization_albert_fast.py | 64 +-
src/transformers/models/auto/auto_factory.py | 357 ++---
.../models/auto/configuration_auto.py | 102 +-
src/transformers/models/auto/dynamic.py | 67 +-
.../models/auto/feature_extraction_auto.py | 81 +-
.../models/auto/processing_auto.py | 73 +-
.../models/auto/tokenization_auto.py | 158 +--
.../models/bart/configuration_bart.py | 90 +-
.../models/bart/modeling_flax_bart.py | 67 +-
.../models/bart/tokenization_bart.py | 4 +-
.../models/bart/tokenization_bart_fast.py | 6 +-
.../models/barthez/tokenization_barthez.py | 91 +-
.../barthez/tokenization_barthez_fast.py | 60 +-
.../models/bartpho/tokenization_bartpho.py | 90 +-
.../models/beit/configuration_beit.py | 81 +-
.../models/beit/feature_extraction_beit.py | 66 +-
src/transformers/models/beit/modeling_beit.py | 25 +-
.../models/beit/modeling_flax_beit.py | 54 +-
.../models/bert/configuration_bert.py | 81 +-
.../models/bert/modeling_flax_bert.py | 40 +-
.../models/bert/modeling_tf_bert.py | 23 +-
.../models/bert/tokenization_bert.py | 94 +-
.../models/bert/tokenization_bert_fast.py | 58 +-
.../configuration_bert_generation.py | 67 +-
.../tokenization_bert_generation.py | 33 +-
.../tokenization_bert_japanese.py | 34 +-
.../models/bertweet/tokenization_bertweet.py | 101 +-
.../models/big_bird/configuration_big_bird.py | 67 +-
.../models/big_bird/modeling_flax_big_bird.py | 18 +-
.../models/big_bird/tokenization_big_bird.py | 67 +-
.../big_bird/tokenization_big_bird_fast.py | 68 +-
.../configuration_bigbird_pegasus.py | 81 +-
.../blenderbot/configuration_blenderbot.py | 85 +-
.../models/blenderbot/modeling_blenderbot.py | 19 +-
.../blenderbot/modeling_flax_blenderbot.py | 67 +-
.../blenderbot/tokenization_blenderbot.py | 12 +-
.../tokenization_blenderbot_fast.py | 14 +-
.../configuration_blenderbot_small.py | 85 +-
.../modeling_blenderbot_small.py | 19 +-
.../modeling_flax_blenderbot_small.py | 67 +-
.../tokenization_blenderbot_small.py | 16 +-
.../tokenization_blenderbot_small_fast.py | 10 +-
.../models/byt5/tokenization_byt5.py | 50 +-
.../camembert/configuration_camembert.py | 2 +-
.../camembert/tokenization_camembert.py | 91 +-
.../camembert/tokenization_camembert_fast.py | 61 +-
.../models/canine/configuration_canine.py | 66 +-
.../models/canine/tokenization_canine.py | 42 +-
.../models/clip/configuration_clip.py | 138 +-
.../models/clip/feature_extraction_clip.py | 74 +-
src/transformers/models/clip/modeling_clip.py | 74 +-
.../models/clip/modeling_flax_clip.py | 72 +-
.../models/clip/processing_clip.py | 100 +-
.../models/clip/tokenization_clip.py | 49 +-
.../models/clip/tokenization_clip_fast.py | 51 +-
.../models/convbert/configuration_convbert.py | 71 +-
.../models/convbert/tokenization_convbert.py | 6 +-
.../convbert/tokenization_convbert_fast.py | 6 +-
.../models/cpm/tokenization_cpm.py | 53 +-
.../models/cpm/tokenization_cpm_fast.py | 53 +-
.../models/ctrl/configuration_ctrl.py | 57 +-
.../models/ctrl/tokenization_ctrl.py | 8 +-
.../models/deberta/configuration_deberta.py | 66 +-
.../models/deberta/tokenization_deberta.py | 46 +-
.../deberta/tokenization_deberta_fast.py | 42 +-
.../deberta_v2/configuration_deberta_v2.py | 62 +-
.../deberta_v2/tokenization_deberta_v2.py | 98 +-
.../models/deit/configuration_deit.py | 55 +-
.../models/deit/feature_extraction_deit.py | 60 +-
src/transformers/models/deit/modeling_deit.py | 56 +-
.../models/detr/configuration_detr.py | 103 +-
.../models/detr/feature_extraction_detr.py | 142 +-
src/transformers/models/detr/modeling_detr.py | 25 +-
.../distilbert/configuration_distilbert.py | 66 +-
.../distilbert/tokenization_distilbert.py | 4 +-
.../tokenization_distilbert_fast.py | 6 +-
.../models/dpr/configuration_dpr.py | 54 +-
src/transformers/models/dpr/modeling_dpr.py | 86 +-
.../models/dpr/modeling_tf_dpr.py | 85 +-
.../models/dpr/tokenization_dpr.py | 122 +-
.../models/dpr/tokenization_dpr_fast.py | 122 +-
.../models/electra/configuration_electra.py | 104 +-
.../models/electra/modeling_flax_electra.py | 16 +-
.../models/electra/modeling_tf_electra.py | 19 +-
.../models/electra/tokenization_electra.py | 4 +-
.../electra/tokenization_electra_fast.py | 6 +-
.../configuration_encoder_decoder.py | 63 +-
.../modeling_encoder_decoder.py | 40 +-
.../modeling_flax_encoder_decoder.py | 89 +-
.../modeling_tf_encoder_decoder.py | 70 +-
.../models/flaubert/configuration_flaubert.py | 98 +-
.../models/flaubert/tokenization_flaubert.py | 6 +-
.../models/fnet/configuration_fnet.py | 69 +-
.../models/fnet/tokenization_fnet.py | 80 +-
.../models/fnet/tokenization_fnet_fast.py | 51 +-
.../models/fsmt/configuration_fsmt.py | 102 +-
.../models/fsmt/tokenization_fsmt.py | 71 +-
.../models/funnel/configuration_funnel.py | 75 +-
.../models/funnel/modeling_tf_funnel.py | 17 +-
.../models/funnel/tokenization_funnel.py | 20 +-
.../models/funnel/tokenization_funnel_fast.py | 22 +-
.../models/gpt2/configuration_gpt2.py | 113 +-
src/transformers/models/gpt2/modeling_gpt2.py | 20 +-
.../models/gpt2/tokenization_gpt2.py | 45 +-
.../models/gpt2/tokenization_gpt2_fast.py | 49 +-
.../models/gpt_neo/configuration_gpt_neo.py | 73 +-
.../models/gptj/configuration_gptj.py | 64 +-
src/transformers/models/gptj/modeling_gptj.py | 21 +-
.../models/herbert/tokenization_herbert.py | 2 +-
.../herbert/tokenization_herbert_fast.py | 42 +-
.../models/hubert/configuration_hubert.py | 151 +-
.../models/hubert/modeling_hubert.py | 31 +-
.../models/hubert/modeling_tf_hubert.py | 31 +-
.../models/ibert/configuration_ibert.py | 60 +-
.../models/ibert/quant_modules.py | 122 +-
.../models/imagegpt/configuration_imagegpt.py | 69 +-
.../imagegpt/feature_extraction_imagegpt.py | 54 +-
.../models/layoutlm/configuration_layoutlm.py | 61 +-
.../models/layoutlm/modeling_layoutlm.py | 43 +-
.../models/layoutlm/modeling_tf_layoutlm.py | 43 +-
.../models/layoutlm/tokenization_layoutlm.py | 4 +-
.../layoutlm/tokenization_layoutlm_fast.py | 4 +-
.../layoutlmv2/configuration_layoutlmv2.py | 90 +-
.../feature_extraction_layoutlmv2.py | 82 +-
.../models/layoutlmv2/modeling_layoutlmv2.py | 21 +-
.../layoutlmv2/processing_layoutlmv2.py | 92 +-
.../layoutlmv2/tokenization_layoutlmv2.py | 169 ++-
.../tokenization_layoutlmv2_fast.py | 72 +-
.../models/layoutxlm/processing_layoutxlm.py | 88 +-
.../layoutxlm/tokenization_layoutxlm.py | 151 +-
.../layoutxlm/tokenization_layoutxlm_fast.py | 79 +-
.../models/led/configuration_led.py | 65 +-
src/transformers/models/led/modeling_led.py | 25 +-
.../models/led/modeling_tf_led.py | 50 +-
.../models/led/tokenization_led.py | 4 +-
.../models/led/tokenization_led_fast.py | 6 +-
.../longformer/configuration_longformer.py | 36 +-
.../models/longformer/modeling_longformer.py | 62 +-
.../longformer/modeling_tf_longformer.py | 25 +-
.../longformer/tokenization_longformer.py | 2 +-
.../tokenization_longformer_fast.py | 4 +-
.../models/luke/configuration_luke.py | 68 +-
src/transformers/models/luke/modeling_luke.py | 41 +-
.../models/luke/tokenization_luke.py | 209 ++-
.../models/lxmert/configuration_lxmert.py | 74 +-
.../models/lxmert/tokenization_lxmert.py | 4 +-
.../models/lxmert/tokenization_lxmert_fast.py | 6 +-
.../models/m2m_100/configuration_m2m_100.py | 76 +-
.../models/m2m_100/tokenization_m2m_100.py | 86 +-
.../models/marian/configuration_marian.py | 85 +-
.../models/marian/modeling_flax_marian.py | 87 +-
.../models/marian/modeling_marian.py | 51 +-
.../models/marian/modeling_tf_marian.py | 30 +-
.../models/marian/tokenization_marian.py | 98 +-
.../models/mbart/configuration_mbart.py | 86 +-
.../models/mbart/modeling_flax_mbart.py | 67 +-
.../models/mbart/tokenization_mbart.py | 49 +-
.../models/mbart/tokenization_mbart_fast.py | 42 +-
.../models/mbart50/tokenization_mbart50.py | 82 +-
.../mbart50/tokenization_mbart50_fast.py | 56 +-
.../configuration_megatron_bert.py | 75 +-
.../models/mluke/tokenization_mluke.py | 288 ++--
.../models/mmbt/configuration_mmbt.py | 8 +-
src/transformers/models/mmbt/modeling_mmbt.py | 13 +-
.../mobilebert/configuration_mobilebert.py | 80 +-
.../mobilebert/modeling_tf_mobilebert.py | 41 +-
.../mobilebert/tokenization_mobilebert.py | 4 +-
.../tokenization_mobilebert_fast.py | 6 +-
.../models/mpnet/configuration_mpnet.py | 59 +-
.../models/mpnet/tokenization_mpnet.py | 106 +-
.../models/mpnet/tokenization_mpnet_fast.py | 60 +-
.../models/mt5/configuration_mt5.py | 47 +-
.../models/openai/configuration_openai.py | 95 +-
.../models/openai/tokenization_openai.py | 12 +-
.../models/openai/tokenization_openai_fast.py | 10 +-
.../models/pegasus/configuration_pegasus.py | 86 +-
.../models/pegasus/modeling_flax_pegasus.py | 67 +-
.../models/pegasus/modeling_pegasus.py | 19 +-
.../models/pegasus/tokenization_pegasus.py | 71 +-
.../pegasus/tokenization_pegasus_fast.py | 53 +-
.../perceiver/configuration_perceiver.py | 83 +-
.../perceiver/feature_extraction_perceiver.py | 64 +-
.../models/perceiver/modeling_perceiver.py | 131 +-
.../perceiver/tokenization_perceiver.py | 43 +-
.../models/phobert/tokenization_phobert.py | 64 +-
.../prophetnet/configuration_prophetnet.py | 68 +-
.../models/prophetnet/modeling_prophetnet.py | 40 +-
.../prophetnet/tokenization_prophetnet.py | 71 +-
.../models/qdqbert/configuration_qdqbert.py | 62 +-
.../models/rag/configuration_rag.py | 94 +-
src/transformers/models/rag/modeling_rag.py | 21 +-
.../models/rag/modeling_tf_rag.py | 24 +-
src/transformers/models/rag/retrieval_rag.py | 162 +--
.../models/reformer/configuration_reformer.py | 152 +-
.../models/reformer/tokenization_reformer.py | 42 +-
.../reformer/tokenization_reformer_fast.py | 26 +-
.../models/rembert/configuration_rembert.py | 58 +-
.../models/rembert/tokenization_rembert.py | 80 +-
.../rembert/tokenization_rembert_fast.py | 74 +-
.../retribert/configuration_retribert.py | 40 +-
.../retribert/tokenization_retribert.py | 4 +-
.../retribert/tokenization_retribert_fast.py | 6 +-
.../models/roberta/configuration_roberta.py | 29 +-
.../models/roberta/tokenization_roberta.py | 97 +-
.../roberta/tokenization_roberta_fast.py | 85 +-
.../models/roformer/configuration_roformer.py | 71 +-
.../models/roformer/tokenization_roformer.py | 83 +-
.../roformer/tokenization_roformer_fast.py | 46 +-
.../segformer/configuration_segformer.py | 73 +-
.../segformer/feature_extraction_segformer.py | 54 +-
.../models/segformer/modeling_segformer.py | 25 +-
.../models/sew/configuration_sew.py | 142 +-
.../models/sew_d/configuration_sew_d.py | 166 ++-
.../configuration_speech_encoder_decoder.py | 63 +-
.../modeling_speech_encoder_decoder.py | 30 +-
.../configuration_speech_to_text.py | 89 +-
.../feature_extraction_speech_to_text.py | 62 +-
.../processing_speech_to_text.py | 76 +-
.../tokenization_speech_to_text.py | 51 +-
.../configuration_speech_to_text_2.py | 68 +-
.../processing_speech_to_text_2.py | 76 +-
.../tokenization_speech_to_text_2.py | 14 +-
.../models/splinter/configuration_splinter.py | 66 +-
.../models/splinter/tokenization_splinter.py | 87 +-
.../splinter/tokenization_splinter_fast.py | 55 +-
.../squeezebert/configuration_squeezebert.py | 72 +-
.../squeezebert/tokenization_squeezebert.py | 4 +-
.../tokenization_squeezebert_fast.py | 6 +-
.../models/t5/configuration_t5.py | 49 +-
.../models/t5/modeling_flax_t5.py | 111 +-
src/transformers/models/t5/modeling_t5.py | 58 +-
src/transformers/models/t5/modeling_tf_t5.py | 38 +-
src/transformers/models/t5/tokenization_t5.py | 77 +-
.../models/t5/tokenization_t5_fast.py | 47 +-
.../models/tapas/configuration_tapas.py | 111 +-
.../models/tapas/modeling_tapas.py | 31 +-
.../models/tapas/modeling_tf_tapas.py | 31 +-
.../models/tapas/tokenization_tapas.py | 265 ++--
.../transfo_xl/configuration_transfo_xl.py | 83 +-
.../modeling_transfo_xl_utilities.py | 10 +-
.../transfo_xl/tokenization_transfo_xl.py | 73 +-
.../models/trocr/configuration_trocr.py | 72 +-
.../models/trocr/processing_trocr.py | 76 +-
.../unispeech/configuration_unispeech.py | 171 ++-
.../configuration_unispeech_sat.py | 191 ++-
.../unispeech_sat/modeling_unispeech_sat.py | 63 +-
.../configuration_vision_encoder_decoder.py | 63 +-
.../modeling_flax_vision_encoder_decoder.py | 101 +-
.../modeling_vision_encoder_decoder.py | 48 +-
.../configuration_vision_text_dual_encoder.py | 65 +-
.../modeling_flax_vision_text_dual_encoder.py | 47 +-
.../modeling_vision_text_dual_encoder.py | 46 +-
.../processing_vision_text_dual_encoder.py | 102 +-
.../visual_bert/configuration_visual_bert.py | 75 +-
.../visual_bert/modeling_visual_bert.py | 37 +-
.../models/vit/configuration_vit.py | 56 +-
.../models/vit/feature_extraction_vit.py | 48 +-
.../models/vit/modeling_flax_vit.py | 56 +-
.../models/vit/modeling_tf_vit.py | 25 +-
src/transformers/models/vit/modeling_vit.py | 25 +-
.../models/wav2vec2/configuration_wav2vec2.py | 212 ++-
.../wav2vec2/feature_extraction_wav2vec2.py | 88 +-
.../models/wav2vec2/modeling_flax_wav2vec2.py | 131 +-
.../models/wav2vec2/modeling_tf_wav2vec2.py | 31 +-
.../models/wav2vec2/processing_wav2vec2.py | 84 +-
.../models/wav2vec2/tokenization_wav2vec2.py | 122 +-
.../tokenization_wav2vec2_phoneme.py | 79 +-
.../processing_wav2vec2_with_lm.py | 122 +-
.../models/wavlm/configuration_wavlm.py | 212 +--
.../models/xlm/configuration_xlm.py | 113 +-
.../models/xlm/tokenization_xlm.py | 77 +-
.../configuration_xlm_prophetnet.py | 2 +-
.../tokenization_xlm_prophetnet.py | 89 +-
.../xlm_roberta/configuration_xlm_roberta.py | 2 +-
.../xlm_roberta/tokenization_xlm_roberta.py | 89 +-
.../tokenization_xlm_roberta_fast.py | 59 +-
.../models/xlnet/configuration_xlnet.py | 125 +-
.../models/xlnet/modeling_tf_xlnet.py | 41 +-
.../models/xlnet/tokenization_xlnet.py | 105 +-
.../models/xlnet/tokenization_xlnet_fast.py | 79 +-
src/transformers/optimization.py | 190 +--
src/transformers/optimization_tf.py | 72 +-
src/transformers/pipelines/__init__.py | 172 +--
.../pipelines/audio_classification.py | 31 +-
.../pipelines/automatic_speech_recognition.py | 36 +-
src/transformers/pipelines/base.py | 240 ++--
src/transformers/pipelines/conversational.py | 101 +-
.../pipelines/feature_extraction.py | 32 +-
src/transformers/pipelines/fill_mask.py | 41 +-
.../pipelines/image_classification.py | 17 +-
.../pipelines/image_segmentation.py | 24 +-
.../pipelines/object_detection.py | 23 +-
.../pipelines/question_answering.py | 88 +-
.../pipelines/table_question_answering.py | 82 +-
.../pipelines/text2text_generation.py | 114 +-
.../pipelines/text_classification.py | 45 +-
src/transformers/pipelines/text_generation.py | 38 +-
.../pipelines/token_classification.py | 48 +-
.../pipelines/zero_shot_classification.py | 40 +-
src/transformers/testing_utils.py | 238 ++--
src/transformers/tokenization_utils.py | 120 +-
src/transformers/tokenization_utils_base.py | 999 +++++++-------
src/transformers/tokenization_utils_fast.py | 65 +-
src/transformers/trainer.py | 303 ++--
src/transformers/trainer_callback.py | 143 +-
src/transformers/trainer_pt_utils.py | 189 +--
src/transformers/trainer_seq2seq.py | 66 +-
src/transformers/trainer_tf.py | 120 +-
src/transformers/trainer_utils.py | 43 +-
src/transformers/training_args.py | 450 +++---
src/transformers/training_args_seq2seq.py | 18 +-
src/transformers/training_args_tf.py | 169 +--
src/transformers/utils/fx.py | 40 +-
src/transformers/utils/logging.py | 41 +-
src/transformers/utils/notebook.py | 69 +-
src/transformers/utils/versions.py | 16 +-
...on_{{cookiecutter.lowercase_modelname}}.py | 116 +-
...st_{{cookiecutter.lowercase_modelname}}.py | 22 +-
...on_{{cookiecutter.lowercase_modelname}}.py | 50 +-
utils/check_repo.py | 45 +
349 files changed, 14049 insertions(+), 13622 deletions(-)
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 23938bfb1f..25358fae42 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -79,36 +79,36 @@ class PretrainedConfig(PushToHubMixin):
- **num_hidden_layers** (`int`) -- The number of blocks in the model.
Arg:
- name_or_path (`str`, _optional_, defaults to `""`):
+ name_or_path (`str`, *optional*, defaults to `""`):
Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
[`TFPreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path` if the configuration was created
with such a method.
- output_hidden_states (`bool`, _optional_, defaults to `False`):
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not the model should return all hidden-states.
- output_attentions (`bool`, _optional_, defaults to `False`):
+ output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not the model should returns all attentions.
- return_dict (`bool`, _optional_, defaults to `True`):
+ return_dict (`bool`, *optional*, defaults to `True`):
Whether or not the model should return a [`~transformers.file_utils.ModelOutput`] instead of a plain tuple.
- is_encoder_decoder (`bool`, _optional_, defaults to `False`):
+ is_encoder_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as an encoder/decoder or not.
- is_decoder (`bool`, _optional_, defaults to `False`):
+ is_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as decoder or not (in which case it's used as an encoder).
- cross_attention_hidden_size** (`bool`, _optional_):
+ cross_attention_hidden_size (`bool`, *optional*):
The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
- add_cross_attention (`bool`, _optional_, defaults to `False`):
+ add_cross_attention (`bool`, *optional*, defaults to `False`):
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
in `AUTO_MODELS_FOR_CAUSAL_LM`.
- tie_encoder_decoder (`bool`, _optional_, defaults to `False`):
+ tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
and decoder model to have the exact same parameter names.
- prune_heads (`Dict[int, List[int]]`, _optional_, defaults to `{}`):
+ prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
heads to prune in said layer.
For instance `{1: [0, 2], 2: [2, 3]}` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
- chunk_size_feed_forward (`int`, _optional_, defaults to `0`):
+ chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
@@ -116,105 +116,105 @@ class PretrainedConfig(PushToHubMixin):
> Parameters for sequence generation
- max_length (`int`, _optional_, defaults to 20):
+ max_length (`int`, *optional*, defaults to 20):
Maximum length that will be used by default in the `generate` method of the model.
- min_length (`int`, _optional_, defaults to 10):
+ min_length (`int`, *optional*, defaults to 10):
Minimum length that will be used by default in the `generate` method of the model.
- do_sample (`bool`, _optional_, defaults to `False`):
+ do_sample (`bool`, *optional*, defaults to `False`):
Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling ;
use greedy decoding otherwise.
- early_stopping (`bool`, _optional_, defaults to `False`):
+ early_stopping (`bool`, *optional*, defaults to `False`):
Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
when at least `num_beams` sentences are finished per batch or not.
- num_beams (`int`, _optional_, defaults to 1):
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
no beam search.
- num_beam_groups (`int`, _optional_, defaults to 1):
+ num_beam_groups (`int`, *optional*, defaults to 1):
Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams
that will be used by default in the `generate` method of the model. 1 means no group beam search.
- diversity_penalty (`float`, _optional_, defaults to 0.0):
+ diversity_penalty (`float`, *optional*, defaults to 0.0):
Value to control diversity for group beam search. that will be used by default in the `generate` method of
the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs.
- temperature (`float`, _optional_, defaults to 1):
+ temperature (`float`, *optional*, defaults to 1):
The value used to module the next token probabilities that will be used by default in the `generate` method
of the model. Must be strictly positive.
- top_k (`int`, _optional_, defaults to 50):
+ top_k (`int`, *optional*, defaults to 50):
Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in
the `generate` method of the model.
- top_p (`float`, _optional_, defaults to 1):
+ top_p (`float`, *optional*, defaults to 1):
Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1,
only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
- repetition_penalty (`float`, _optional_, defaults to 1):
+ repetition_penalty (`float`, *optional*, defaults to 1):
Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
means no penalty.
- length_penalty (`float`, _optional_, defaults to 1):
+ length_penalty (`float`, *optional*, defaults to 1):
Exponential penalty to the length that will be used by default in the `generate` method of the model.
- no_repeat_ngram_size (`int`, _optional_, defaults to 0) -- Value that will be used by default in the
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
`generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
only occur once.
- encoder_no_repeat_ngram_size (`int`, _optional_, defaults to 0) -- Value that will be used by
+ encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by
default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all
ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
- bad_words_ids (`List[int]`, _optional_):
+ bad_words_ids (`List[int]`, *optional*):
List of token ids that are not allowed to be generated that will be used by default in the `generate`
method of the model. In order to get the tokens of the words that should not appear in the generated text,
use `tokenizer.encode(bad_word, add_prefix_space=True)`.
- num_return_sequences (`int`, _optional_, defaults to 1):
+ num_return_sequences (`int`, *optional*, defaults to 1):
Number of independently computed returned sequences for each element in the batch that will be used by
default in the `generate` method of the model.
- output_scores (`bool`, _optional_, defaults to `False`):
+ output_scores (`bool`, *optional*, defaults to `False`):
Whether the model should return the logits when used for generation.
- return_dict_in_generate (`bool`, _optional_, defaults to `False`):
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
Whether the model should return a [`~transformers.file_utils.ModelOutput`] instead of a `torch.LongTensor`.
- forced_bos_token_id (`int`, _optional_):
+ forced_bos_token_id (`int`, *optional*):
The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
language token.
- forced_eos_token_id (`int`, _optional_):
+ forced_eos_token_id (`int`, *optional*):
The id of the token to force as the last generated token when `max_length` is reached.
- remove_invalid_values (`bool`, _optional_):
+ remove_invalid_values (`bool`, *optional*):
Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash.
Note that using `remove_invalid_values` can slow down generation.
> Parameters for fine-tuning tasks
- architectures (`List[str]`, _optional_): Model architectures that can be used with the model pretrained weights.
- finetuning_task (`str`, _optional_):
+ architectures (`List[str]`, *optional*): Model architectures that can be used with the model pretrained weights.
+ finetuning_task (`str`, *optional*):
Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
or PyTorch) checkpoint.
- id2label (`Dict[int, str]`, _optional_):
+ id2label (`Dict[int, str]`, *optional*):
A map from index (for instance prediction index, or target index) to label.
- label2id (`Dict[str, int]`, _optional_): A map from label to index for the model.
- num_labels (`int`, _optional_):
+ label2id (`Dict[str, int]`, *optional*): A map from label to index for the model.
+ num_labels (`int`, *optional*):
Number of labels to use in the last layer added to the model, typically for a classification task.
- task_specific_params (`Dict[str, Any]`, _optional_):
+ task_specific_params (`Dict[str, Any]`, *optional*):
Additional keyword arguments to store for the current task.
- problem_type (`str`, _optional_):
+ problem_type (`str`, *optional*):
Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
`"single_label_classification"` or `"multi_label_classification"`.
> Parameters linked to the tokenizer
- tokenizer_class (`str`, _optional_):
+ tokenizer_class (`str`, *optional*):
The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the
model by default).
- prefix (`str`, _optional_):
+ prefix (`str`, *optional*):
A specific prompt that should be added at the beginning of each text before calling the model.
- bos_token_id (`int`, _optional_): The id of the _beginning-of-stream_ token.
- pad_token_id (`int`, _optional_): The id of the _padding_ token.
- eos_token_id (`int`, _optional_): The id of the _end-of-stream_ token.
- decoder_start_token_id (`int`, _optional_):
+ bos_token_id (`int`, *optional*): The id of the _beginning-of-stream_ token.
+ pad_token_id (`int`, *optional*): The id of the _padding_ token.
+ eos_token_id (`int`, *optional*): The id of the _end-of-stream_ token.
+ decoder_start_token_id (`int`, *optional*):
If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
- sep_token_id (`int`, _optional_): The id of the _separation_ token.
+ sep_token_id (`int`, *optional*): The id of the _separation_ token.
> PyTorch specific parameters
- torchscript (`bool`, _optional_, defaults to `False`):
+ torchscript (`bool`, *optional*, defaults to `False`):
Whether or not the model should be used with Torchscript.
- tie_word_embeddings (`bool`, _optional_, defaults to `True`):
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
model has a output word embedding layer.
- torch_dtype (`str`, _optional_):
+ torch_dtype (`str`, *optional*):
The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
(which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
model is `float16`, ideally we want to load it back using the minimal amount of memory needed to load
@@ -227,7 +227,7 @@ class PretrainedConfig(PushToHubMixin):
> TensorFlow specific parameters
- use_bfloat16 (`bool`, _optional_, defaults to `False`):
+ use_bfloat16 (`bool`, *optional*, defaults to `False`):
Whether or not the model should use BFloat16 scalars (only used by some TensorFlow models).
"""
model_type: str = ""
@@ -370,7 +370,7 @@ class PretrainedConfig(PushToHubMixin):
@property
def use_return_dict(self) -> bool:
"""
- :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples.
+ `bool`: Whether or not to return [`~file_utils.ModelOutput`] instead of tuples.
"""
# If torchscript is set, force `return_dict=False` to avoid jit errors
return self.return_dict and not self.torchscript
@@ -378,7 +378,7 @@ class PretrainedConfig(PushToHubMixin):
@property
def num_labels(self) -> int:
"""
- :obj:`int`: The number of labels for classification models.
+ `int`: The number of labels for classification models.
"""
return len(self.id2label)
@@ -390,25 +390,27 @@ class PretrainedConfig(PushToHubMixin):
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
"""
- Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the
- :func:`~transformers.PretrainedConfig.from_pretrained` class method.
+ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
+ [`~PretrainedConfig.from_pretrained`] class method.
Args:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory where the configuration JSON file will be saved (will be created if it does not exist).
- push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it.
- .. warning::
+ <Tip warning={true}>
- Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
- :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
- pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
- instead.
+ Using `push_to_hub=True` will synchronize the repository you are pushing to with
+ `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+ pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+ instead.
+
+ </Tip>
kwargs:
Additional key word arguments passed along to the
- :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+ [`~file_utils.PushToHubMixin.push_to_hub`] method.
"""
if os.path.isfile(save_directory):
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -431,73 +433,73 @@ class PretrainedConfig(PushToHubMixin):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
r"""
- Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pretrained model
+ Instantiate a [`PretrainedConfig`] (or a derived class) from a pretrained model
configuration.
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a configuration file saved using the
- :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``.
- - a path or url to a saved configuration JSON `file`, e.g.,
- ``./my_model_directory/configuration.json``.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a configuration file saved using the
+ [`~PretrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
+ - a path or url to a saved configuration JSON *file*, e.g.,
+ `./my_model_directory/configuration.json`.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if
they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file
exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`False`, then this function returns just the final configuration object.
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ If `False`, then this function returns just the final configuration object.
- If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs`
+ If `True`, then this function returns a `Tuple(config, unused_kwargs)` where `unused_kwargs`
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e.,
- the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored.
- kwargs (:obj:`Dict[str, Any]`, `optional`):
+ the part of `kwargs` which has not been used to update `config` and is otherwise ignored.
+ kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
- by the ``return_unused_kwargs`` keyword parameter.
+ by the `return_unused_kwargs` keyword parameter.
- .. note::
+ <Tip>
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
+ </Tip>
Returns:
- :class:`PretrainedConfig`: The configuration object instantiated from this pretrained model.
+ [`PretrainedConfig`]: The configuration object instantiated from this pretrained model.
- Examples::
+ Examples:
- # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
- # derived class: BertConfig
- config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache.
- config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
- config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
- config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
- assert config.output_attentions == True
- config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
- foo=False, return_unused_kwargs=True)
- assert config.output_attentions == True
- assert unused_kwargs == {'foo': False}
-
- """
+ ```python
+ # We can't directly instantiate the base class `PretrainedConfig` so let's show the examples on a
+ # derived class: BertConfig
+ config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache.
+ config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+ config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+ config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+ assert config.output_attentions == True
+ config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
+ foo=False, return_unused_kwargs=True)
+ assert config.output_attentions == True
+ assert unused_kwargs == {'foo': False}
+ ```"""
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warn(
@@ -512,17 +514,17 @@ class PretrainedConfig(PushToHubMixin):
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
- From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
- :class:`~transformers.PretrainedConfig` using ``from_dict``.
+ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+ [`PretrainedConfig`] using `from_dict`.
Parameters:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns:
- :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
+ `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
"""
cache_dir = kwargs.pop("cache_dir", None)
@@ -608,18 +610,18 @@ class PretrainedConfig(PushToHubMixin):
@classmethod
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
"""
- Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters.
+ Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
Args:
- config_dict (:obj:`Dict[str, Any]`):
+ config_dict (`Dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the
- :func:`~transformers.PretrainedConfig.get_config_dict` method.
- kwargs (:obj:`Dict[str, Any]`):
+ [`~PretrainedConfig.get_config_dict`] method.
+ kwargs (`Dict[str, Any]`):
Additional parameters from which to initialize the configuration object.
Returns:
- :class:`PretrainedConfig`: The configuration object instantiated from those parameters.
+ [`PretrainedConfig`]: The configuration object instantiated from those parameters.
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
@@ -647,14 +649,14 @@ class PretrainedConfig(PushToHubMixin):
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig":
"""
- Instantiates a :class:`~transformers.PretrainedConfig` from the path to a JSON file of parameters.
+ Instantiates a [`PretrainedConfig`] from the path to a JSON file of parameters.
Args:
- json_file (:obj:`str` or :obj:`os.PathLike`):
+ json_file (`str` or `os.PathLike`):
Path to the JSON file containing the parameters.
Returns:
- :class:`PretrainedConfig`: The configuration object instantiated from that JSON file.
+ [`PretrainedConfig`]: The configuration object instantiated from that JSON file.
"""
config_dict = cls._dict_from_json_file(json_file)
@@ -678,7 +680,7 @@ class PretrainedConfig(PushToHubMixin):
serializes to a Python dictionary.
Returns:
- :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
config_dict = self.to_dict()
@@ -709,7 +711,7 @@ class PretrainedConfig(PushToHubMixin):
Serializes this instance to a Python dictionary.
Returns:
- :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
@@ -727,12 +729,12 @@ class PretrainedConfig(PushToHubMixin):
Serializes this instance to a JSON string.
Args:
- use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to ``True``, only the difference between the config instance and the default
- ``PretrainedConfig()`` is serialized to JSON string.
+ use_diff (`bool`, *optional*, defaults to `True`):
+ If set to `True`, only the difference between the config instance and the default
+ `PretrainedConfig()` is serialized to JSON string.
Returns:
- :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format.
+ `str`: String containing all the attributes that make up this configuration instance in JSON format.
"""
if use_diff is True:
config_dict = self.to_diff_dict()
@@ -745,36 +747,36 @@ class PretrainedConfig(PushToHubMixin):
Save this instance to a JSON file.
Args:
- json_file_path (:obj:`str` or :obj:`os.PathLike`):
+ json_file_path (`str` or `os.PathLike`):
Path to the JSON file in which this configuration instance's parameters will be saved.
- use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to ``True``, only the difference between the config instance and the default
- ``PretrainedConfig()`` is serialized to JSON file.
+ use_diff (`bool`, *optional*, defaults to `True`):
+ If set to `True`, only the difference between the config instance and the default
+ `PretrainedConfig()` is serialized to JSON file.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string(use_diff=use_diff))
def update(self, config_dict: Dict[str, Any]):
"""
- Updates attributes of this class with attributes from ``config_dict``.
+ Updates attributes of this class with attributes from `config_dict`.
Args:
- config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
+ config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
"""
for key, value in config_dict.items():
setattr(self, key, value)
def update_from_string(self, update_str: str):
"""
- Updates attributes of this class with attributes from ``update_str``.
+ Updates attributes of this class with attributes from `update_str`.
- The expected format is ints, floats and strings as is, and for booleans use ``true`` or ``false``. For example:
+ The expected format is ints, floats and strings as is, and for booleans use `true` or `false`. For example:
"n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
The keys to change have to already exist in the config object.
Args:
- update_str (:obj:`str`): String with attributes that should be updated for this class.
+ update_str (`str`): String with attributes that should be updated for this class.
"""
@@ -804,8 +806,8 @@ class PretrainedConfig(PushToHubMixin):
def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
"""
- Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a
- string of just the type. For example, :obj:`torch.float32` get converted into `"float32"` string, which can
+ Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a
+ string of just the type. For example, `torch.float32` gets converted into the `"float32"` string, which can
then be stored in the json format.
"""
if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
@@ -822,20 +824,20 @@ def get_configuration_file(
Get the configuration file to use for this version of transformers.
Args:
- path_or_repo (:obj:`str` or :obj:`os.PathLike`):
- Can be either the id of a repo on huggingface.co or a path to a `directory`.
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ path_or_repo (`str` or `os.PathLike`):
+ Can be either the id of a repo on huggingface.co or a path to a *directory*.
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns:
- :obj:`str`: The configuration file to use.
+ `str`: The configuration file to use.
"""
# Inspect all files from the repo/folder.
all_files = get_list_of_files(
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index ef6832ed71..e9611fdca6 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -13,10 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
- Utilities to convert slow tokenizers in their fast tokenizers counterparts.
+Utilities to convert slow tokenizers into their fast tokenizers counterparts.
- All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
- allow to make our dependency on SentencePiece optional.
+All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
+allow to make our dependency on SentencePiece optional.
"""
from typing import Dict, List, Tuple
@@ -960,13 +960,13 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
Utilities to convert a slow tokenizer instance in a fast tokenizer instance.
Args:
- transformer_tokenizer (:class:`~transformers.tokenization_utils_base.PreTrainedTokenizer`):
+ transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
Instance of a slow tokenizer to convert in the backend tokenizer for
- :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerFast`.
+ [`~tokenization_utils_base.PreTrainedTokenizerFast`].
Return:
- A instance of :class:`~tokenizers.Tokenizer` to be used as the backend tokenizer of a
- :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerFast`
+ An instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
+ [`~tokenization_utils_base.PreTrainedTokenizerFast`]
"""
tokenizer_class_name = transformer_tokenizer.__class__.__name__
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 8b16280e3f..f6750268cb 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -50,8 +50,8 @@ def default_data_collator(features: List[InputDataClass], return_tensors="pt") -
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- - ``label``: handles a single value (int or float) per object
- - ``label_ids``: handles a list of values per object
+ - `label`: handles a single value (int or float) per object
+ - `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for example of how it's useful.
@@ -76,8 +76,8 @@ class DefaultDataCollator(DataCollatorMixin):
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- - ``label``: handles a single value (int or float) per object
- - ``label_ids``: handles a list of values per object
+ - `label`: handles a single value (int or float) per object
+ - `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for example of how it's useful.
@@ -86,7 +86,7 @@ class DefaultDataCollator(DataCollatorMixin):
helpful if you need to set a return_tensors value at initialization.
Args:
- return_tensors (:obj:`str`):
+ return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -213,26 +213,26 @@ class DataCollatorWithPadding:
Data collator that will dynamically pad the inputs received.
Args:
- tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
- pad_to_multiple_of (:obj:`int`, `optional`):
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
- return_tensors (:obj:`str`):
+ return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -265,28 +265,28 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
- tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
- pad_to_multiple_of (:obj:`int`, `optional`):
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
- label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
+ label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
- return_tensors (:obj:`str`):
+ return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -515,33 +515,33 @@ class DataCollatorForSeq2Seq:
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
- tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
- model (:class:`~transformers.PreTrainedModel`):
- The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
- prepare the `decoder_input_ids`
+ model ([`PreTrainedModel`]):
+ The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
+ prepare the `decoder_input_ids`
- This is useful when using `label_smoothing` to avoid calculating loss twice.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+ This is useful when using `label_smoothing` to avoid calculating loss twice.
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
- pad_to_multiple_of (:obj:`int`, `optional`):
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
- label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
+ label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
- return_tensors (:obj:`str`):
+ return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -605,26 +605,27 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
are not all of the same length.
Args:
- tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
- mlm (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the
+ mlm (`bool`, *optional*, defaults to `True`):
+ Whether or not to use masked language modeling. If set to `False`, the labels are the same as the
inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
non-masked tokens and the value to predict for the masked token.
- mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
- The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
- pad_to_multiple_of (:obj:`int`, `optional`):
+ mlm_probability (`float`, *optional*, defaults to 0.15):
+ The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`.
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
- return_tensors (:obj:`str`):
+ return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
- .. note::
+ <Tip>
- For best performance, this data collator should be used with a dataset having items that are dictionaries or
- BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
- :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
- argument :obj:`return_special_tokens_mask=True`.
- """
+ For best performance, this data collator should be used with a dataset having items that are dictionaries or
+ BatchEncoding, with the `"special_tokens_mask"` key, as returned by a
+ [`PreTrainedTokenizer`] or a [`PreTrainedTokenizerFast`] with the
+ argument `return_special_tokens_mask=True`.
+
+ </Tip>"""
tokenizer: PreTrainedTokenizerBase
mlm: bool = True
@@ -845,13 +846,14 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
- .. note::
+ <Tip>
- This collator relies on details of the implementation of subword tokenization by
- :class:`~transformers.BertTokenizer`, specifically that subword tokens are prefixed with `##`. For tokenizers
- that do not adhere to this scheme, this collator will produce an output that is roughly equivalent to
- :class:`.DataCollatorForLanguageModeling`.
- """
+ This collator relies on details of the implementation of subword tokenization by
+ [`BertTokenizer`], specifically that subword tokens are prefixed with `##`. For tokenizers
+ that do not adhere to this scheme, this collator will produce an output that is roughly equivalent to
+ [`DataCollatorForLanguageModeling`].
+
+ </Tip>"""
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
@@ -1227,14 +1229,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
- 0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
- 1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
+ 0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+ 1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be
masked)
- 2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
+ 2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
masked
- 3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
- span_length]`` and mask tokens ``start_index:start_index + span_length``
- 4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
+ 3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+ 4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
import torch
@@ -1325,14 +1326,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
- 0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
- 1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
+ 0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+ 1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be
masked)
- 2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
+ 2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
masked
- 3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
- span_length]`` and mask tokens ``start_index:start_index + span_length``
- 4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
+ 3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+ 4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
from random import randint
@@ -1434,14 +1434,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
- 0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
- 1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
+ 0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+ 1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be
masked)
- 2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
+ 2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
masked
- 3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
- span_length]`` and mask tokens ``start_index:start_index + span_length``
- 4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
+ 3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+ 4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
from random import randint
diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py
index 3dc3e6544e..24ff39ddb3 100644
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -48,20 +48,20 @@ def glue_convert_examples_to_features(
output_mode=None,
):
"""
- Loads a data file into a list of ``InputFeatures``
+ Loads a data file into a list of `InputFeatures`
Args:
- examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
+ examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length. Defaults to the tokenizer's max_len
task: GLUE task
- label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
- output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+ label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
+ output_mode: String indicating the output mode. Either `regression` or `classification`
Returns:
- If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
- task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
- ``InputFeatures`` which can be fed to the model.
+ If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
+ task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
+ `InputFeatures` which can be fed to the model.
"""
warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
@@ -84,7 +84,7 @@ if is_tf_available():
) -> tf.data.Dataset:
"""
Returns:
- A ``tf.data.Dataset`` containing the task-specific features.
+ A `tf.data.Dataset` containing the task-specific features.
"""
processor = glue_processors[task]()
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index cea84fb3b1..208ebe504f 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -332,8 +332,8 @@ def squad_convert_examples_to_features(
model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
- examples: list of :class:`~transformers.data.processors.squad.SquadExample`
- tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+ examples: list of [`~data.processors.squad.SquadExample`]
+ tokenizer: an instance of a child of [`PreTrainedTokenizer`]
max_seq_length: The maximum sequence length of the inputs.
doc_stride: The stride used when the context is too large and is split across several features.
max_query_length: The maximum length of the query.
@@ -345,22 +345,23 @@ def squad_convert_examples_to_features(
Returns:
- list of :class:`~transformers.data.processors.squad.SquadFeatures`
+ list of [`~data.processors.squad.SquadFeatures`]
- Example::
+ Example:
- processor = SquadV2Processor()
- examples = processor.get_dev_examples(data_dir)
+ ```python
+ processor = SquadV2Processor()
+ examples = processor.get_dev_examples(data_dir)
- features = squad_convert_examples_to_features(
- examples=examples,
- tokenizer=tokenizer,
- max_seq_length=args.max_seq_length,
- doc_stride=args.doc_stride,
- max_query_length=args.max_query_length,
- is_training=not evaluate,
- )
- """
+ features = squad_convert_examples_to_features(
+ examples=examples,
+ tokenizer=tokenizer,
+ max_seq_length=args.max_seq_length,
+ doc_stride=args.doc_stride,
+ max_query_length=args.max_query_length,
+ is_training=not evaluate,
+ )
+ ```"""
# Defining helper methods
features = []
@@ -574,23 +575,24 @@ class SquadProcessor(DataProcessor):
def get_examples_from_dataset(self, dataset, evaluate=False):
"""
- Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
+ Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
Args:
- dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
+ dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
evaluate: Boolean specifying if in evaluation mode or in training mode
Returns:
List of SquadExample
- Examples::
+ Examples:
- >>> import tensorflow_datasets as tfds
- >>> dataset = tfds.load("squad")
+ ```python
+ >>> import tensorflow_datasets as tfds
+ >>> dataset = tfds.load("squad")
- >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
- >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
- """
+ >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
+ >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+ ```"""
if evaluate:
dataset = dataset["validation"]
@@ -759,8 +761,8 @@ class SquadExample:
class SquadFeatures:
"""
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
- :class:`~transformers.data.processors.squad.SquadExample` using the
- :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
+ [`~data.processors.squad.SquadExample`] using the
+ [`~data.processors.squad.squad_convert_examples_to_features`] method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index e96376d01e..bb008fe153 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -60,7 +60,7 @@ class InputFeatures:
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
- Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
+ Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
tokens.
token_type_ids: (Optional) Segment token indices to indicate first and second
portions of the inputs. Only some models use them.
@@ -92,15 +92,15 @@ class DataProcessor:
raise NotImplementedError()
def get_train_examples(self, data_dir):
- """Gets a collection of :class:`InputExample` for the train set."""
+ """Gets a collection of [`InputExample`] for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
- """Gets a collection of :class:`InputExample` for the dev set."""
+ """Gets a collection of [`InputExample`] for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
- """Gets a collection of :class:`InputExample` for the test set."""
+ """Gets a collection of [`InputExample`] for the test set."""
raise NotImplementedError()
def get_labels(self):
@@ -240,21 +240,21 @@ class SingleSentenceClassificationProcessor(DataProcessor):
return_tensors=None,
):
"""
- Convert examples in a list of ``InputFeatures``
+ Convert examples in a list of `InputFeatures`
Args:
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length
- pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+ pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
- mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
- and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+ mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
+ and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for
actual values)
Returns:
- If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
- task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
- ``InputFeatures`` which can be fed to the model.
+ If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
+ task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
+ `InputFeatures` which can be fed to the model.
"""
if max_length is None:
diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 4588ca58f5..d876c40527 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -28,7 +28,7 @@ logger = logging.get_logger(__name__)
class DebugUnderflowOverflow:
"""
This debug class helps detect and understand where the model starts getting very large or very small, and more
- importantly ``nan`` or ``inf`` weight and activation elements.
+ importantly `nan` or `inf` weight and activation elements.
There are 2 working modes:
@@ -37,69 +37,77 @@ class DebugUnderflowOverflow:
Mode 1: Underflow/overflow detection
- To activate the underflow/overflow detection, initialize the object with the model ::
+ To activate the underflow/overflow detection, initialize the object with the model:
- debug_overflow = DebugUnderflowOverflow(model)
+ ```python
+ debug_overflow = DebugUnderflowOverflow(model)
+ ```
- then run the training as normal and if ``nan`` or ``inf`` gets detected in at least one of the weight, input or
- output elements this module will throw an exception and will print ``max_frames_to_save`` frames that lead to this
+ then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or
+ output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
event, each frame reporting
- 1. the fully qualified module name plus the class name whose ``forward`` was run
+ 1. the fully qualified module name plus the class name whose `forward` was run
2. the absolute min and max value of all elements for each module weights, and the inputs and output
- For example, here is the header and the last few frames in detection report for ``google/mt5-small`` run in fp16 mixed precision ::
+ For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16 mixed precision:
- Detected inf/nan during batch_number=0
- Last 21 forward frames:
- abs min abs max metadata
- [...]
- encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
- 2.17e-07 4.50e+00 weight
- 1.79e-06 4.65e+00 input[0]
- 2.68e-06 3.70e+01 output
- encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
- 8.08e-07 2.66e+01 weight
- 1.79e-06 4.65e+00 input[0]
- 1.27e-04 2.37e+02 output
- encoder.block.2.layer.1.DenseReluDense.wo Linear
- 1.01e-06 6.44e+00 weight
- 0.00e+00 9.74e+03 input[0]
- 3.18e-04 6.27e+04 output
- encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
- 1.79e-06 4.65e+00 input[0]
- 3.18e-04 6.27e+04 output
- encoder.block.2.layer.1.dropout Dropout
- 3.18e-04 6.27e+04 input[0]
- 0.00e+00 inf output
+ ```
+ Detected inf/nan during batch_number=0
+ Last 21 forward frames:
+ abs min abs max metadata
+ [...]
+ encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+ 2.17e-07 4.50e+00 weight
+ 1.79e-06 4.65e+00 input[0]
+ 2.68e-06 3.70e+01 output
+ encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+ 8.08e-07 2.66e+01 weight
+ 1.79e-06 4.65e+00 input[0]
+ 1.27e-04 2.37e+02 output
+ encoder.block.2.layer.1.DenseReluDense.wo Linear
+ 1.01e-06 6.44e+00 weight
+ 0.00e+00 9.74e+03 input[0]
+ 3.18e-04 6.27e+04 output
+ encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+ 1.79e-06 4.65e+00 input[0]
+ 3.18e-04 6.27e+04 output
+ encoder.block.2.layer.1.dropout Dropout
+ 3.18e-04 6.27e+04 input[0]
+ 0.00e+00 inf output
+ ```
- You can see here, that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value
- was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which
+ You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
+ was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
64K, and we get an overlow.
As you can see it's the previous frames that we need to look into when the numbers start going into very large for
fp16 numbers.
- The tracking is done in a forward hook, which gets invoked immediately after ``forward`` has completed.
+ The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
- By default the last 21 frames are printed. You can change the default to adjust for your needs. For example ::
+ By default the last 21 frames are printed. You can change the default to adjust for your needs. For example:
- debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+ ```python
+ debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+ ```
- To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
- take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in the next
- section.
+ To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
+ take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in the next
+ section.
- Mode 2. Specific batch absolute min/max tracing without detection
+ Mode 2. Specific batch absolute min/max tracing without detection
- The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
+ The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
- Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a
- given batch, and only do that for batches 1 and 3. Then you instantiate this class as ::
+ Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
+ given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
- debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+ ```python
+ debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+ ```
And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
@@ -109,28 +117,29 @@ class DebugUnderflowOverflow:
Early stopping:
- You can also specify the batch number after which to stop the training, with ::
+ You can also specify the batch number after which to stop the training, with:
- debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+ ```python
+ debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+ ```
This feature is mainly useful in the tracing mode, but you can use it for any mode.
**Performance**:
- As this module measures absolute ``min``/``max`` of each weight of the model on every forward it'll slow the
+ As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the
training down. Therefore remember to turn it off once the debugging needs have been met.
Args:
- model (:obj:`nn.Module`):
+ model (`nn.Module`):
The model to debug.
- max_frames_to_save (:obj:`int`, `optional`, defaults to 21):
+ max_frames_to_save (`int`, *optional*, defaults to 21):
How many frames back to record
+ trace_batch_nums (`List[int]`, *optional*, defaults to `[]`):
+ trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
Which batch numbers to trace (turns detection off)
- abort_after_batch_num (:obj:`int`, `optional`):
+ abort_after_batch_num (`int`, *optional*):
Whether to abort after a certain batch number has finished
-
"""
def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
@@ -287,7 +296,7 @@ def get_abs_min_max(var, ctx):
def detect_overflow(var, ctx):
"""
- Report whether the tensor contains any ``nan`` or ``inf`` entries.
+ Report whether the tensor contains any `nan` or `inf` entries.
This is useful for detecting overflows/underflows and best to call right after the function that did some math that
modified the tensor in question.
@@ -300,7 +309,7 @@ def detect_overflow(var, ctx):
ctx: the message to print as a context
Return:
- :obj:`True` if ``inf`` or ``nan`` was detected, :obj:`False` otherwise
+ `True` if `inf` or `nan` was detected, `False` otherwise
"""
detected = False
if torch.isnan(var).any().item():
diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py
index d1d2114c89..0680be888d 100644
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -41,16 +41,16 @@ class HfDeepSpeedConfig:
"""
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
- A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where
- things like the Trainer object is not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``).
+ A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
+ things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`).
Therefore it's important that this object remains alive while the program is still running.
- :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to
- sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder
- values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way.
+ [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to
+ sync the configuration with values of [`TrainingArguments`] by replacing special placeholder
+ values: `"auto"`. Without this special logic the DeepSpeed configuration is not modified in any way.
Args:
- config_file_or_dict (:obj:`Union[str, Dict]`): path to DeepSpeed config file or dict.
+ config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
"""
@@ -104,7 +104,7 @@ class HfDeepSpeedConfig:
def get_value(self, ds_key_long, default=None):
"""
- Returns the set value or ``default`` if no value is set
+ Returns the set value or `default` if no value is set
"""
config, ds_key = self.find_config_node(ds_key_long)
if config is None:
@@ -115,7 +115,7 @@ class HfDeepSpeedConfig:
"""
Deletes a sub-section of the config file if it's found.
- Unless ``must_exist`` is :obj:`True` the section doesn't have to exist.
+ Unless `must_exist` is `True` the section doesn't have to exist.
"""
config = self.config
@@ -136,8 +136,7 @@ class HfDeepSpeedConfig:
def is_true(self, ds_key_long):
"""
- Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to
- ask the very specific question of whether the value is set to :obj:`True` (and it's not set to :obj:`False` or
+ Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very specific question of whether the value is set to `True` (and it's not set to `False` or
isn't set).
"""
@@ -146,8 +145,7 @@ class HfDeepSpeedConfig:
def is_false(self, ds_key_long):
"""
- Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to
- ask the very specific question of whether the value is set to :obj:`False` (and it's not set to :obj:`True` or
+ Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very specific question of whether the value is set to `False` (and it's not set to `True` or
isn't set).
"""
value = self.get_value(ds_key_long)
@@ -165,7 +163,7 @@ class HfDeepSpeedConfig:
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
- The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has
+ The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has
the same lifespan as the latter.
"""
@@ -181,11 +179,11 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
A utility method that massages the config file and can optionally verify that the values match.
- 1. Replace "auto" values with ``TrainingArguments`` value.
+ 1. Replace "auto" values with `TrainingArguments` value.
- 2. If it wasn't "auto" and ``must_match`` is true, then check that DS config matches Trainer
- config values and if mismatched add the entry to ``self.mismatched`` - will assert during
- ``trainer_config_finalize`` for one or more mismatches.
+ 2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
+ config values and if mismatched add the entry to `self.mismatched` - will assert during
+ `trainer_config_finalize` for one or more mismatches.
"""
config, ds_key = self.find_config_node(ds_key_long)
@@ -207,7 +205,7 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
def trainer_config_process(self, args):
"""
- Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object
+ Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
"""
# DeepSpeed does:
@@ -373,7 +371,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inf
"""
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
- If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made.
+ If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
Args:
trainer: Trainer object
diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py
index 5d8304b062..fbf2fc37ec 100644
--- a/src/transformers/feature_extraction_sequence_utils.py
+++ b/src/transformers/feature_extraction_sequence_utils.py
@@ -40,11 +40,11 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
This is a general feature extraction class for speech recognition.
Args:
- feature_size (:obj:`int`):
+ feature_size (`int`):
The feature dimension of the extracted features.
- sampling_rate (:obj:`int`):
+ sampling_rate (`int`):
The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
- padding_value (:obj:`float`):
+ padding_value (`float`):
The value that is used to fill the padding values / vectors.
"""
@@ -79,53 +79,54 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
max sequence length in the batch.
Padding side (left/right) padding values are defined at the feature extractor level (with
- ``self.padding_side``, ``self.padding_value``)
+ `self.padding_side`, `self.padding_value`)
- .. note::
+
- If the ``processed_features`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
- the result will use the same type unless you provide a different tensor type with ``return_tensors``. In
- the case of PyTorch tensors, you will lose the specific device of your tensors however.
+ If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
+ the result will use the same type unless you provide a different tensor type with `return_tensors`. In
+ the case of PyTorch tensors, you will lose the specific device of your tensors however.
+
+
Args:
- processed_features (:class:`~transformers.BatchFeature`, list of :class:`~transformers.BatchFeature`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`):
- Processed inputs. Can represent one input (:class:`~transformers.BatchFeature` or :obj:`Dict[str,
- List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchFeature`,
- `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
+ processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`):
+ Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of input values / vectors (list of [`BatchFeature`],
+ `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
preprocessing as well as in a PyTorch Dataloader collate function.
- Instead of :obj:`List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
+ Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
tensors), see the note above for the return type.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
- truncation (:obj:`bool`):
- Activates truncation to cut input sequences longer than :obj:`max_length` to :obj:`max_length`.
- pad_to_multiple_of (:obj:`int`, `optional`):
+ truncation (`bool`):
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (:obj:`bool`, `optional`):
+ return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
- `What are attention masks? <../glossary.html#attention-mask>`__
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ [What are attention masks?](../glossary#attention-mask)
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
"""
# If we have a list of dicts, let's convert it in a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index fd616b59cf..e96c8e23cd 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -54,16 +54,16 @@ PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"] # noqa: F821
class BatchFeature(UserDict):
r"""
- Holds the output of the :meth:`~transformers.SequenceFeatureExtractor.pad` and feature extractor specific
- ``__call__`` methods.
+ Holds the output of the [`~SequenceFeatureExtractor.pad`] and feature extractor specific
+ `__call__` methods.
This class is derived from a python dictionary and can be used as a dictionary.
Args:
- data (:obj:`dict`):
+ data (`dict`):
Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask',
etc.).
- tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
+ tensor_type (`Union[None, str, TensorType]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
initialization.
"""
@@ -74,7 +74,7 @@ class BatchFeature(UserDict):
def __getitem__(self, item: str) -> Union[Any]:
"""
- If the key is a string, returns the value of the dict associated to :obj:`key` ('input_values',
+ If the key is a string, returns the value of the dict associated to `key` ('input_values',
'attention_mask', etc.).
"""
if isinstance(item, str):
@@ -112,9 +112,9 @@ class BatchFeature(UserDict):
Convert the inner content to tensors.
Args:
- tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
- The type of tensors to use. If :obj:`str`, should be one of the values of the enum
- :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
+ tensor_type (`str` or [`~file_utils.TensorType`], *optional*):
+ The type of tensors to use. If `str`, should be one of the values of the enum
+ [`~file_utils.TensorType`]. If `None`, no modification is done.
"""
if tensor_type is None:
return self
@@ -176,13 +176,13 @@ class BatchFeature(UserDict):
# Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature
def to(self, device: Union[str, "torch.device"]) -> "BatchFeature":
"""
- Send all values to device by calling :obj:`v.to(device)` (PyTorch only).
+ Send all values to device by calling `v.to(device)` (PyTorch only).
Args:
- device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
+ device (`str` or `torch.device`): The device to put the tensors on.
Returns:
- :class:`~transformers.BatchFeature`: The same instance after modification.
+ [`BatchFeature`]: The same instance after modification.
"""
# This check catches things like APEX blindly calling "to" on all inputs to a module
@@ -216,83 +216,84 @@ class FeatureExtractionMixin:
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> PreTrainedFeatureExtractor:
r"""
- Instantiate a type of :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` from a feature
- extractor, *e.g.* a derived class of :class:`~transformers.SequenceFeatureExtractor`.
+ Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature
+ extractor, *e.g.* a derived class of [`SequenceFeatureExtractor`].
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a feature extractor file saved using the
- :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` method, e.g.,
- ``./my_model_directory/``.
- - a path or url to a saved feature extractor JSON `file`, e.g.,
- ``./my_model_directory/preprocessor_config.json``.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a feature extractor file saved using the
+ [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
+ `./my_model_directory/`.
+ - a path or url to a saved feature extractor JSON *file*, e.g.,
+ `./my_model_directory/preprocessor_config.json`.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the feature extractor files and override the cached versions
if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file
exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
- then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ If `False`, then this function returns just the final feature extractor object. If `True`,
+ then this function returns a `Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
- part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
- kwargs (:obj:`Dict[str, Any]`, `optional`):
+ part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+ kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
- controlled by the ``return_unused_kwargs`` keyword parameter.
+ controlled by the `return_unused_kwargs` keyword parameter.
- .. note::
+
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
+
Returns:
- A feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`.
+ A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`].
- Examples::
+ Examples:
- # We can't instantiate directly the base class `FeatureExtractionMixin` nor `SequenceFeatureExtractor` so let's show the examples on a
- # derived class: `Wav2Vec2FeatureExtractor`
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h') # Download feature_extraction_config from huggingface.co and cache.
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/') # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
- assert feature_extractor.return_attention_mask is False
- feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
- foo=False, return_unused_kwargs=True)
- assert feature_extractor.return_attention_mask is False
- assert unused_kwargs == {'foo': False}
- """
+ ```python
+ # We can't instantiate directly the base class `FeatureExtractionMixin` nor `SequenceFeatureExtractor` so let's show the examples on a
+ # derived class: `Wav2Vec2FeatureExtractor`
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h') # Download feature_extraction_config from huggingface.co and cache.
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/') # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
+ assert feature_extractor.return_attention_mask is False
+ feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
+ foo=False, return_unused_kwargs=True)
+ assert feature_extractor.return_attention_mask is False
+ assert unused_kwargs == {'foo': False}
+ ```"""
feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_dict(feature_extractor_dict, **kwargs)
def save_pretrained(self, save_directory: Union[str, os.PathLike]):
"""
- Save a feature_extractor object to the directory ``save_directory``, so that it can be re-loaded using the
- :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` class method.
+ Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the
+ [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method.
Args:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file will be saved (will be created if it does not exist).
"""
if os.path.isfile(save_directory):
@@ -309,16 +310,16 @@ class FeatureExtractionMixin:
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
- From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
- feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` using
- ``from_dict``.
+ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+ feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using
+ `from_dict`.
Parameters:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns:
- :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor
+ `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor
object.
"""
cache_dir = kwargs.pop("cache_dir", None)
@@ -397,19 +398,19 @@ class FeatureExtractionMixin:
@classmethod
def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor:
"""
- Instantiates a type of :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` from a Python
+ Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python
dictionary of parameters.
Args:
- feature_extractor_dict (:obj:`Dict[str, Any]`):
+ feature_extractor_dict (`Dict[str, Any]`):
Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the
- :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.to_dict` method.
- kwargs (:obj:`Dict[str, Any]`):
+ [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
+ kwargs (`Dict[str, Any]`):
Additional parameters from which to initialize the feature extractor object.
Returns:
- :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`: The feature extractor object
+ [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature extractor object
instantiated from those parameters.
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
@@ -436,7 +437,7 @@ class FeatureExtractionMixin:
Serializes this instance to a Python dictionary.
Returns:
- :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
@@ -446,15 +447,15 @@ class FeatureExtractionMixin:
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]) -> PreTrainedFeatureExtractor:
"""
- Instantiates a feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`
+ Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]
from the path to a JSON file of parameters.
Args:
- json_file (:obj:`str` or :obj:`os.PathLike`):
+ json_file (`str` or `os.PathLike`):
Path to the JSON file containing the parameters.
Returns:
- A feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`: The
+ A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The
feature_extractor object instantiated from that JSON file.
"""
with open(json_file, "r", encoding="utf-8") as reader:
@@ -467,7 +468,7 @@ class FeatureExtractionMixin:
Serializes this instance to a JSON string.
Returns:
- :obj:`str`: String containing all the attributes that make up this feature_extractor instance in JSON
+ `str`: String containing all the attributes that make up this feature_extractor instance in JSON
format.
"""
dictionary = self.to_dict()
@@ -483,7 +484,7 @@ class FeatureExtractionMixin:
Save this instance to a JSON file.
Args:
- json_file_path (:obj:`str` or :obj:`os.PathLike`):
+ json_file_path (`str` or `os.PathLike`):
Path to the JSON file in which this feature_extractor instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
diff --git a/src/transformers/generation_beam_search.py b/src/transformers/generation_beam_search.py
index aa20350b9a..663e8c31f0 100644
--- a/src/transformers/generation_beam_search.py
+++ b/src/transformers/generation_beam_search.py
@@ -25,70 +25,70 @@ from .file_utils import add_start_docstrings
PROCESS_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
- Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses.
- next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
- :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses.
- next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
- Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
+ [What are input IDs?](../glossary#input-ids)
+ next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
+ Current scores of the top `2 * num_beams` non-finished beam hypotheses.
+ next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+ `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
+ next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+ Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
Return:
- :obj:`UserDict`: A dictionary composed of the fields as defined above:
+ `UserDict`: A dictionary composed of the fields as defined above:
- - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated
+ - **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated
scores of all non-finished beams.
- - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens
+ - **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens
to be added to the non-finished beam_hypotheses.
- - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices
+ - **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
indicating to which beam the next tokens shall be added.
"""
FINALIZE_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
+ [What are input IDs?](../glossary#input-ids)
+ final_beam_scores (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
The final scores of all non-finished beams.
- final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
+ final_beam_tokens (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
The last tokens to be added to the non-finished beam_hypotheses.
- final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
- The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
+ final_beam_indices (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
+ The beam indices indicating to which beam the `final_beam_tokens` shall be added.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
Return:
- :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
- sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
- batches finished early due to the :obj:`eos_token_id`.
+ `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
+ sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all
+ batches finished early due to the `eos_token_id`.
"""
class BeamScorer(ABC):
"""
- Abstract base class for all beam scorers that are used for :meth:`~transformers.PreTrainedModel.beam_search` and
- :meth:`~transformers.PreTrainedModel.beam_sample`.
+ Abstract base class for all beam scorers that are used for [`~PreTrainedModel.beam_search`] and
+ [`~PreTrainedModel.beam_sample`].
"""
@abstractmethod
@@ -119,36 +119,34 @@ class BeamScorer(ABC):
class BeamSearchScorer(BeamScorer):
r"""
- :class:`transformers.BeamScorer` implementing standard beam search decoding.
+ [`BeamScorer`] implementing standard beam search decoding.
- Adapted in part from `Facebook's XLM beam search code
- `__.
+ Adapted in part from [Facebook's XLM beam search code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
- Reference for the diverse beam search algorithm and implementation `Ashwin Kalyan's DBS implementation
- `__
+ Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua)
Args:
- batch_size (:obj:`int`):
- Batch Size of :obj:`input_ids` for which standard beam search decoding is run in parallel.
- max_length (:obj:`int`):
+ batch_size (`int`):
+ Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
+ max_length (`int`):
The maximum length of the sequence to be generated.
- num_beams (:obj:`int`):
+ num_beams (`int`):
Number of beams for beam search.
- device (:obj:`torch.device`):
- Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of
- :obj:`BeamSearchScorer` will be allocated.
- length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+ device (`torch.device`):
+ Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of
+ `BeamSearchScorer` will be allocated.
+ length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
sequences.
- do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
- num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1):
+ do_early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+ num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
The number of beam hypotheses that shall be returned upon calling
- :meth:`~transformer.BeamSearchScorer.finalize`.
- num_beam_groups (:obj:`int`):
- Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
- beams. See `this paper `__ for more details.
+ [`~BeamSearchScorer.finalize`].
+ num_beam_groups (`int`):
+ Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+ beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
"""
def __init__(
diff --git a/src/transformers/generation_flax_logits_process.py b/src/transformers/generation_flax_logits_process.py
index 1244291775..1b6bd4df6c 100644
--- a/src/transformers/generation_flax_logits_process.py
+++ b/src/transformers/generation_flax_logits_process.py
@@ -29,22 +29,22 @@ logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`PreTrainedTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- scores (:obj:`jnp.ndarray` of shape :obj:`(batch_size, config.vocab_size)`):
+ [What are input IDs?](../glossary#input-ids)
+ scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
search or log softmax for each vocabulary token when using beam search
kwargs:
Additional logits processor specific kwargs.
Return:
- :obj:`jnp.ndarray` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores.
+ `jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
@@ -73,10 +73,10 @@ class FlaxLogitsWarper(ABC):
class FlaxLogitsProcessorList(list):
"""
- This class can be used to create a list of :class:`~transformers.FlaxLogitsProcessor` or
- :class:`~transformers.FlaxLogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits
- from list and adds a specific `__call__` method to apply each :class:`~transformers.FlaxLogitsProcessor` or
- :class:`~transformers.FlaxLogitsWarper` to the inputs.
+ This class can be used to create a list of [`FlaxLogitsProcessor`] or
+ [`FlaxLogitsWarper`] to subsequently process a `scores` input tensor. This class inherits
+ from list and adds a specific `__call__` method to apply each [`FlaxLogitsProcessor`] or
+ [`FlaxLogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -97,10 +97,10 @@ class FlaxLogitsProcessorList(list):
class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
r"""
- :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution).
+ [`FlaxLogitsWarper`] for temperature (exponential scaling output probability distribution).
Args:
- temperature (:obj:`float`):
+ temperature (`float`):
The value used to module the logits distribution.
"""
@@ -117,16 +117,16 @@ class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
class FlaxTopPLogitsWarper(FlaxLogitsWarper):
"""
- :class:`transformers.LogitsWarper` that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
+ [`FlaxLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
prob_cut_off.
Args:
- top_p (:obj:`float`):
- If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are
+ top_p (`float`):
+ If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
kept for generation.
- filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+ filter_value (`float`, *optional*, defaults to `-float("Inf")`):
All filtered values will be set to this float value.
- min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+ min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
@@ -159,14 +159,14 @@ class FlaxTopPLogitsWarper(FlaxLogitsWarper):
class FlaxTopKLogitsWarper(FlaxLogitsWarper):
r"""
- :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements.
+ [`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
Args:
- top_k (:obj:`int`):
+ top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
- filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+ filter_value (`float`, *optional*, defaults to `-float("Inf")`):
All filtered values will be set to this float value.
- min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+ min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
@@ -195,10 +195,10 @@ class FlaxTopKLogitsWarper(FlaxLogitsWarper):
class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
- :class:`~transformers.FlaxLogitsProcessor` that enforces the specified token as the first generated token.
+ [`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.
Args:
- bos_token_id (:obj:`int`):
+ bos_token_id (`int`):
The id of the token to force as the first generated token.
"""
@@ -219,14 +219,14 @@ class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
- :class:`~transformers.FlaxLogitsProcessor` that enforces the specified token as the last generated token when
- :obj:`max_length` is reached.
+ [`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when
+ `max_length` is reached.
Args:
- max_length (:obj:`int`):
+ max_length (`int`):
The maximum length of the sequence to be generated.
- eos_token_id (:obj:`int`):
- The id of the token to force as the last generated token when :obj:`max_length` is reached.
+ eos_token_id (`int`):
+ The id of the token to force as the last generated token when `max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
@@ -247,13 +247,13 @@ class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
r"""
- :class:`transformers.FlaxLogitsProcessor` enforcing a min-length by setting EOS probability to 0.
+ [`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
Args:
- min_length (:obj:`int`):
- The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
- eos_token_id (:obj:`int`):
- The id of the `end-of-sequence` token.
+ min_length (`int`):
+ The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+ eos_token_id (`int`):
+ The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py
index fa5a4225e5..634540a215 100644
--- a/src/transformers/generation_flax_utils.py
+++ b/src/transformers/generation_flax_utils.py
@@ -48,7 +48,7 @@ class FlaxGreedySearchOutput(ModelOutput):
Args:
- sequences (:obj:`jnp.ndarray` of shape :obj:`(batch_size, max_length)`):
+ sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
The generated sequences.
"""
@@ -62,7 +62,7 @@ class FlaxSampleOutput(ModelOutput):
Args:
- sequences (:obj:`jnp.ndarray` of shape :obj:`(batch_size, max_length)`):
+ sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
The generated sequences.
"""
@@ -76,9 +76,9 @@ class FlaxBeamSearchOutput(ModelOutput):
Args:
- sequences (:obj:`jnp.ndarray` of shape :obj:`(batch_size, max_length)`):
+ sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
The generated sequences.
- scores (:obj:`jnp.ndarray` of shape :obj:`(batch_size,)`):
+ scores (`jnp.ndarray` of shape `(batch_size,)`):
The scores (log probabilites) of the generated sequences.
"""
@@ -119,7 +119,7 @@ class BeamSearchState:
class FlaxGenerationMixin:
"""
A class containing all of the functions supporting generation, to be used as a mixin in
- :class:`~transformers.FlaxPreTrainedModel`.
+ [`FlaxPreTrainedModel`].
"""
@staticmethod
@@ -149,7 +149,7 @@ class FlaxGenerationMixin:
"""
This function can be overwritten in the specific modeling_flax_.py classes to allow for custom beam
search behavior. Note that the only model that overwrites this method is
- :class:`~transformes.FlaxMarianMTModel`.
+ [`FlaxMarianMTModel`].
"""
return logits
@@ -181,61 +181,62 @@ class FlaxGenerationMixin:
Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
and, multinomial sampling.
- Apart from :obj:`input_ids`, all the arguments below will default to the value of the attribute of the same
- name inside the :class:`~transformers.PretrainedConfig` of the model. The default values indicated are the
+ Apart from `input_ids`, all the arguments below will default to the value of the attribute of the same
+ name inside the [`PretrainedConfig`] of the model. The default values indicated are the
default values of those config.
- Most of these parameters are explained in more detail in `this blog post
- `__.
+ Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
Parameters:
- input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
- max_length (:obj:`int`, `optional`, defaults to 20):
+ max_length (`int`, *optional*, defaults to 20):
The maximum length of the sequence to be generated.
- do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_sample (`bool`, *optional*, defaults to `False`):
Whether or not to use sampling ; use greedy decoding otherwise.
- temperature (:obj:`float`, `optional`, defaults to 1.0):
+ temperature (`float`, *optional*, defaults to 1.0):
The value used to module the next token probabilities.
- top_k (:obj:`int`, `optional`, defaults to 50):
+ top_k (`int`, *optional*, defaults to 50):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
- top_p (:obj:`float`, `optional`, defaults to 1.0):
- If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or
+ top_p (`float`, *optional*, defaults to 1.0):
+ If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- bos_token_id (:obj:`int`, `optional`):
- The id of the `beginning-of-sequence` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ bos_token_id (`int`, *optional*):
+ The id of the *beginning-of-sequence* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- decoder_start_token_id (:obj:`int`, `optional`):
- If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
- trace (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to trace generation. Setting ``trace=False`` should only be used for debugging and will lead to
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ trace (`bool`, *optional*, defaults to `True`):
+ Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to
a considerably slower runtime.
- params (:obj:`Dict[str, jnp.ndarray]`, `optional`):
+ params (`Dict[str, jnp.ndarray]`, *optional*):
Optionally the model parameters can be passed. Can be useful for parallelized generation.
model_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
+ Additional model specific kwargs will be forwarded to the `forward` function of the model.
Return:
- :class:`~transformers.file_utils.ModelOutput`.
+ [`~file_utils.ModelOutput`].
- Examples::
- >>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM
+ Examples:
- >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
- >>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
- >>> input_context = "The dog"
- >>> # encode input context
- >>> input_ids = tokenizer(input_context, return_tensors="np").input_ids
- >>> # generate candidates using sampling
- >>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- """
+ ```python
+ >>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+ >>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
+ >>> input_context = "The dog"
+ >>> # encode input context
+ >>> input_ids = tokenizer(input_context, return_tensors="np").input_ids
+ >>> # generate candidates using sampling
+ >>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```"""
# set init values
max_length = max_length if max_length is not None else self.config.max_length
bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
@@ -326,8 +327,8 @@ class FlaxGenerationMixin:
self, top_k: int = None, top_p: float = None, temperature: float = None
) -> FlaxLogitsProcessorList:
"""
- This class returns a :class:`~transformers.FlaxLogitsProcessorList` list object that contains all relevant
- :class:`~transformers.FlaxLogitsWarper` instances used for multinomial sampling.
+ This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
+ [`FlaxLogitsWarper`] instances used for multinomial sampling.
"""
# init warp parameters
@@ -358,8 +359,8 @@ class FlaxGenerationMixin:
forced_eos_token_id: int,
) -> FlaxLogitsProcessorList:
"""
- This class returns a :class:`~transformers.FlaxLogitsProcessorList` list object that contains all relevant
- :class:`~transformers.FlaxLogitsProcessor` instances used to modify the scores of the language model head.
+ This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
+ [`FlaxLogitsProcessor`] instances used to modify the scores of the language model head.
"""
processors = FlaxLogitsProcessorList()
diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 4ce7c99444..8a9285f757 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -30,22 +30,22 @@ logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
+ [What are input IDs?](../glossary#input-ids)
+ scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
search or log softmax for each vocabulary token when using beam search
kwargs:
Additional logits processor specific kwargs.
Return:
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores.
+ `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
@@ -74,10 +74,10 @@ class LogitsWarper(ABC):
class LogitsProcessorList(list):
"""
- This class can be used to create a list of :class:`~transformers.LogitsProcessor` or
- :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from
- list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or
- :class:`~transformers.LogitsWarper` to the inputs.
+ This class can be used to create a list of [`LogitsProcessor`] or
+ [`LogitsWarper`] to subsequently process a `scores` input tensor. This class inherits from
+ list and adds a specific `__call__` method to apply each [`LogitsProcessor`] or
+ [`LogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -98,13 +98,13 @@ class LogitsProcessorList(list):
class MinLengthLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.
+ [`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
Args:
- min_length (:obj:`int`):
- The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
- eos_token_id (:obj:`int`):
- The id of the `end-of-sequence` token.
+ min_length (`int`):
+ The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+ eos_token_id (`int`):
+ The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
@@ -126,10 +126,10 @@ class MinLengthLogitsProcessor(LogitsProcessor):
class TemperatureLogitsWarper(LogitsWarper):
r"""
- :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution).
+ [`LogitsWarper`] for temperature (exponential scaling output probability distribution).
Args:
- temperature (:obj:`float`):
+ temperature (`float`):
The value used to module the logits distribution.
"""
@@ -146,12 +146,11 @@ class TemperatureLogitsWarper(LogitsWarper):
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` enforcing an exponential penalty on repeated sequences.
+ [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.
Args:
- repetition_penalty (:obj:`float`):
- The parameter for repetition penalty. 1.0 means no penalty. See `this paper
- `__ for more details.
+ repetition_penalty (`float`):
+ The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
"""
def __init__(self, penalty: float):
@@ -172,16 +171,16 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
class TopPLogitsWarper(LogitsWarper):
"""
- :class:`transformers.LogitsWarper` that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
+ [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to <=
prob_cut_off.
Args:
- top_p (:obj:`float`):
- If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are
+ top_p (`float`):
+ If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
kept for generation.
- filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+ filter_value (`float`, *optional*, defaults to `-float("Inf")`):
All filtered values will be set to this float value.
- min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+ min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
@@ -215,14 +214,14 @@ class TopPLogitsWarper(LogitsWarper):
class TopKLogitsWarper(LogitsWarper):
r"""
- :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements.
+ [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
Args:
- top_k (:obj:`int`):
+ top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
- filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+ filter_value (`float`, *optional*, defaults to `-float("Inf")`):
All filtered values will be set to this float value.
- min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+ min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
@@ -279,12 +278,11 @@ def _calc_banned_ngram_tokens(
class NoRepeatNGramLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq
- `__.
+ [`LogitsProcessor`] that enforces no repetition of n-grams. See [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
Args:
- ngram_size (:obj:`int`):
- All ngrams of size :obj:`ngram_size` can only occur once.
+ ngram_size (`int`):
+ All ngrams of size `ngram_size` can only occur once.
"""
def __init__(self, ngram_size: int):
@@ -305,13 +303,13 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor):
class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids.
- See `ParlAI `__.
+ [`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids.
+ See [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350).
Args:
- encoder_ngram_size (:obj:`int`):
- All ngrams of size :obj:`ngram_size` can only occur within the encoder input ids.
- encoder_input_ids (:obj:`int`):
+ encoder_ngram_size (`int`):
+ All ngrams of size `encoder_ngram_size` can only occur within the encoder input ids.
+ encoder_input_ids (`int`):
The encoder_input_ids that should not be repeated within the decoder ids.
"""
@@ -346,15 +344,14 @@ class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
class NoBadWordsLogitsProcessor(LogitsProcessor):
"""
- :class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled.
+ [`LogitsProcessor`] that enforces that specified sequences will never be sampled.
Args:
- bad_words_ids (:obj:`List[List[int]]`):
+ bad_words_ids (`List[List[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
- that should not appear in the generated text, use :obj:`tokenizer(bad_word,
- add_prefix_space=True).input_ids`.
- eos_token_id (:obj:`int`):
- The id of the `end-of-sequence` token.
+ that should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`.
+ eos_token_id (`int`):
+ The id of the *end-of-sequence* token.
"""
def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
@@ -474,16 +471,16 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):
class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned
- constrained generation. See `Autoregressive Entity Retrieval `__ for more
+ [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned
+ constrained generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more
information.
Args:
- prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`):
+ prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`):
This function constraints the beam search to allowed tokens only at each step. This function takes 2
- arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed
- tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and
- the batch ID :obj:`batch_id`.
+ arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed
+ tokens for the next generation step conditioned on the previously generated tokens `inputs_ids` and
+ the batch ID `batch_id`.
"""
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
@@ -501,20 +498,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
class HammingDiversityLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only
- effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse
- Solutions from Neural Sequence Models `__ for more details.
+ [`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only
+ effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse
+ Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
Args:
- diversity_penalty (:obj:`float`):
+ diversity_penalty (`float`):
This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
- particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled.
- num_beams (:obj:`int`):
- Number of beams used for group beam search. See `this paper `__ for
+ particular time. Note that `diversity_penalty` is only effective if *group beam search* is enabled.
+ num_beams (`int`):
+ Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
more details.
- num_beam_groups (:obj:`int`):
- Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
- beams. See `this paper `__ for more details.
+ num_beam_groups (`int`):
+ Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+ beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
"""
def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
@@ -561,10 +558,10 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
r"""
- :class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token.
+ [`LogitsProcessor`] that enforces the specified token as the first generated token.
Args:
- bos_token_id (:obj:`int`):
+ bos_token_id (`int`):
The id of the token to force as the first generated token.
"""
@@ -582,14 +579,14 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
r"""
- :class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when
- :obj:`max_length` is reached.
+ [`LogitsProcessor`] that enforces the specified token as the last generated token when
+ `max_length` is reached.
Args:
- max_length (:obj:`int`):
+ max_length (`int`):
The maximum length of the sequence to be generated.
- eos_token_id (:obj:`int`):
- The id of the token to force as the last generated token when :obj:`max_length` is reached.
+ eos_token_id (`int`):
+ The id of the token to force as the last generated token when `max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
@@ -607,9 +604,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
class InfNanRemoveLogitsProcessor(LogitsProcessor):
r"""
- :class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to avoid the generation
+ [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation
method to fail. Note that using the logits processor should only be used if necessary since it can slow down the
- generation method. :obj:`max_length` is reached.
+ generation method.
"""
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
diff --git a/src/transformers/generation_stopping_criteria.py b/src/transformers/generation_stopping_criteria.py
index 479a524606..27b92114bd 100644
--- a/src/transformers/generation_stopping_criteria.py
+++ b/src/transformers/generation_stopping_criteria.py
@@ -11,22 +11,22 @@ from .file_utils import add_start_docstrings
STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Args:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`~transformers.BertTokenizer`. See
- :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+ Indices can be obtained using [`BertTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details.
- `What are input IDs? <../glossary.html#input-ids>`__
- scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
+ [What are input IDs?](../glossary#input-ids)
+ scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax.
kwargs:
Additional stopping criteria specific kwargs.
Return:
- :obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop.
+ `bool`. `False` indicates we should continue, `True` indicates we should stop.
"""
@@ -41,11 +41,11 @@ class StoppingCriteria(ABC):
class MaxLengthCriteria(StoppingCriteria):
"""
- This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`.
+ This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`.
Keep in mind for decoder-only type of transformers, this will include the initial prompted tokens.
Args:
- max_length (:obj:`int`):
+ max_length (`int`):
The maximum length that the output sequence can have in number of tokens.
"""
@@ -59,14 +59,14 @@ class MaxLengthCriteria(StoppingCriteria):
class MaxNewTokensCriteria(StoppingCriteria):
"""
- This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`.
+ This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`.
Keep in mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is
- very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens.
+ very close to [`MaxLengthCriteria`] but ignores the number of initial tokens.
Args:
- start_length (:obj:`int`):
+ start_length (`int`):
The number of initial tokens.
- max_new_tokens (:obj:`int`):
+ max_new_tokens (`int`):
The maximum number of tokens to generate.
"""
@@ -90,12 +90,12 @@ class MaxTimeCriteria(StoppingCriteria):
"""
This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
time will start being counted when you initialize this function. You can override this by passing an
- :obj:`initial_time`.
+ `initial_time`.
Args:
- max_time (:obj:`float`):
+ max_time (`float`):
The maximum allowed time in seconds for the generation.
- initial_time (:obj:`float`, `optional`, defaults to :obj:`time.time()`):
+ initial_time (`float`, *optional*, defaults to `time.time()`):
The start of the generation allowed time.
"""
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index d91ff8ce6f..5cc103c7f7 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -34,19 +34,19 @@ class TFGreedySearchDecoderOnlyOutput(ModelOutput):
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with
- each tensor of shape :obj:`(batch_size, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with
+ each tensor of shape `(batch_size, config.vocab_size)`.
+ attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+ `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -64,28 +64,27 @@ class TFGreedySearchEncoderDecoderOutput(ModelOutput):
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
- :obj:`(batch_size, config.vocab_size)`).
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+ `(batch_size, config.vocab_size)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+ `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -104,20 +103,19 @@ class TFSampleDecoderOnlyOutput(ModelOutput):
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with
- each tensor of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with
+ each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+ attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(num_return_sequences*batch_size, num_heads, generated_length,
- sequence_length)`.
- hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(num_return_sequences*batch_size, generated_length, hidden_size)`.
+ `tf.Tensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -135,29 +133,28 @@ class TFSampleEncoderDecoderOutput(ModelOutput):
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
- :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape
- :obj:`(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size*num_return_sequences, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+ `(batch_size*num_return_sequences, config.vocab_size)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer of the decoder) of shape
+ `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, num_heads, generated_length,
- sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `tf.Tensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, generated_length, hidden_size)`.
+ `tf.Tensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -175,23 +172,22 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput):
Base class for outputs of decoder-only generation models using beam search.
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
- :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape
+ `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+ attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
- hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
- hidden_size)`.
+ `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -209,34 +205,31 @@ class TFBeamSearchEncoderDecoderOutput(ModelOutput):
attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
- :obj:`(batch_size*num_beams, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+ `(batch_size*num_beams, config.vocab_size)`.
+ attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
- sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
- hidden_size)`.
+ `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -255,22 +248,22 @@ class TFBeamSampleDecoderOnlyOutput(ModelOutput):
Base class for outputs of decoder-only generation models using beam sample.
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
- :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape
+ `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+ attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
- hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+ `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -288,31 +281,30 @@ class TFBeamSampleEncoderDecoderOutput(ModelOutput):
encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
Args:
- sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`tf.Tensor` of shape `(batch_size*num_beams, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
- :obj:`(batch_size*num_beams, config.vocab_size)`).
- encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape :obj:`(batch_size, num_heads,
- sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
- shape :obj:`(batch_size*num_beams, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+ `(batch_size*num_beams, config.vocab_size)`.
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size*num_beams, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+ `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
"""
sequences: tf.Tensor = None
@@ -334,12 +326,12 @@ TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoder
class TFGenerationMixin:
"""
A class containing all of the functions supporting generation, to be used as a mixin in
- :class:`~transformers.TFPreTrainedModel`.
+ [`TFPreTrainedModel`].
"""
def prepare_inputs_for_generation(self, inputs, **kwargs):
"""
- Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
+ Implement in subclasses of [`TFPreTrainedModel`] for custom behavior to prepare inputs in
the generate method.
"""
return {"input_ids": inputs}
@@ -387,148 +379,146 @@ class TFGenerationMixin:
Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
- Adapted in part from `Facebook's XLM beam search code
- `__.
+ Adapted in part from [Facebook's XLM beam search code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
- Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the
- attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values
+ Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the
+ attribute of the same name inside the [`PretrainedConfig`] of the model. The default values
indicated are the default values of those config.
- Most of these parameters are explained in more detail in `this blog post
- `__.
+ Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
Parameters:
- input_ids (:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size, sequence_length)`, `optional`):
- The sequence used as a prompt for the generation. If :obj:`None` the method initializes it with
- :obj:`bos_token_id` and a batch size of 1.
- max_length (:obj:`int`, `optional`, defaults to 20):
+ input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+ The sequence used as a prompt for the generation. If `None` the method initializes it with
+ `bos_token_id` and a batch size of 1.
+ max_length (`int`, *optional*, defaults to 20):
The maximum length of the sequence to be generated.
- min_length (:obj:`int`, `optional`, defaults to 10):
+ min_length (`int`, *optional*, defaults to 10):
The minimum length of the sequence to be generated.
- do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_sample (`bool`, *optional*, defaults to `False`):
Whether or not to use sampling ; use greedy decoding otherwise.
- early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- temperature (:obj:`float`, `optional`, defaults to 1.0):
+ temperature (`float`, *optional*, defaults to 1.0):
The value used to module the next token probabilities.
- top_k (:obj:`int`, `optional`, defaults to 50):
+ top_k (`int`, *optional*, defaults to 50):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
- top_p (:obj:`float`, `optional`, defaults to 1.0):
- If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or
+ top_p (`float`, *optional*, defaults to 1.0):
+ If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
- repetition_penalty (:obj:`float`, `optional`, defaults to 1.0):
- The parameter for repetition penalty. 1.0 means no penalty. See `this paper
- `__ for more details.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- bos_token_id (:obj:`int`, `optional`):
- The id of the `beginning-of-sequence` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+ repetition_penalty (`float`, *optional*, defaults to 1.0):
+ The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ bos_token_id (`int`, *optional*):
+ The id of the *beginning-of-sequence* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty.
Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
order to encourage the model to produce longer sequences.
- no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once.
- bad_words_ids(:obj:`List[int]`, `optional`):
+ bad_words_ids (`List[int]`, *optional*):
List of token ids that are not allowed to be generated. In order to get the tokens of the words that
- should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+ should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
+ num_return_sequences (`int`, *optional*, defaults to 1):
The number of independently computed returned sequences for each element in the batch.
- attention_mask (:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for
+ attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for
tokens that are not masked, and 0 for masked tokens.
- If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token.
+ If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token.
- `What are attention masks? <../glossary.html#attention-mask>`__
- decoder_start_token_id (:obj:`int`, `optional`):
- If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
- use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should use the past last key/values attentions (if applicable to the model) to
speed up decoding.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- forced_bos_token_id (:obj:`int`, `optional`):
- The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`.
- Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ forced_bos_token_id (`int`, *optional*):
+ The id of the token to force as the first generated token after the `decoder_start_token_id`.
+ Useful for multilingual models like [mBART](../model_doc/mbart) where the first generated token
needs to be the target language token.
- forced_eos_token_id (:obj:`int`, `optional`):
- The id of the token to force as the last generated token when :obj:`max_length` is reached.
+ forced_eos_token_id (`int`, *optional*):
+ The id of the token to force as the last generated token when `max_length` is reached.
model_specific_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
+ Additional model specific kwargs will be forwarded to the `forward` function of the model.
Return:
- :class:`~transformers.file_utils.ModelOutput` or :obj:`tf.Tensor`: A
- :class:`~transformers.file_utils.ModelOutput` (if ``return_dict_in_generate=True`` or when
- ``config.return_dict_in_generate=True``) or a :obj:`tf.Tensor`.
+ [`~file_utils.ModelOutput`] or `tf.Tensor`: A
+ [`~file_utils.ModelOutput`] (if `return_dict_in_generate=True` or when
+ `config.return_dict_in_generate=True`) or a `tf.Tensor`.
- If the model is `not` an encoder-decoder model (``model.config.is_encoder_decoder=False``), the
- possible :class:`~transformers.file_utils.ModelOutput` types are:
+ If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the
+ possible [`~file_utils.ModelOutput`] types are:
- - :class:`~transformers.generation_utils.TFGreedySearchDecoderOnlyOutput`,
- - :class:`~transformers.generation_utils.TFSampleDecoderOnlyOutput`,
- - :class:`~transformers.generation_utils.TFBeamSearchDecoderOnlyOutput`,
- - :class:`~transformers.generation_utils.TFBeamSampleDecoderOnlyOutput`
+ - [`~generation_utils.TFGreedySearchDecoderOnlyOutput`],
+ - [`~generation_utils.TFSampleDecoderOnlyOutput`],
+ - [`~generation_utils.TFBeamSearchDecoderOnlyOutput`],
+ - [`~generation_utils.TFBeamSampleDecoderOnlyOutput`]
- If the model is an encoder-decoder model (``model.config.is_encoder_decoder=True``), the possible
- :class:`~transformers.file_utils.ModelOutput` types are:
+ If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+ [`~file_utils.ModelOutput`] types are:
- - :class:`~transformers.generation_utils.TFGreedySearchEncoderDecoderOutput`,
- - :class:`~transformers.generation_utils.TFSampleEncoderDecoderOutput`,
- - :class:`~transformers.generation_utils.TFBeamSearchEncoderDecoderOutput`,
- - :class:`~transformers.generation_utils.TFBeamSampleEncoderDecoderOutput`
+ - [`~generation_utils.TFGreedySearchEncoderDecoderOutput`],
+ - [`~generation_utils.TFSampleEncoderDecoderOutput`],
+ - [`~generation_utils.TFBeamSearchEncoderDecoderOutput`],
+ - [`~generation_utils.TFBeamSampleEncoderDecoderOutput`]
- Examples::
+ Examples:
- tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
- model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from huggingface.co and cache.
- outputs = model.generate(max_length=40) # do greedy decoding
- print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
+ ```python
+ tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
+ model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from huggingface.co and cache.
+ outputs = model.generate(max_length=40) # do greedy decoding
+ print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
- tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
- model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from huggingface.co and cache.
- input_context = 'The dog'
- input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
- outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
- for i in range(3): # 3 output sequences were generated
- print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
+ tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
+ model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from huggingface.co and cache.
+ input_context = 'The dog'
+ input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
+ outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
+ for i in range(3): # 3 output sequences were generated
+ print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
- tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
- model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from huggingface.co and cache.
- input_context = 'The dog'
- input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
- outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # generate 3 candidates using sampling
- for i in range(3): # 3 output sequences were generated
- print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
+ tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
+ model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from huggingface.co and cache.
+ input_context = 'The dog'
+ input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
+ outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # generate 3 candidates using sampling
+ for i in range(3): # 3 output sequences were generated
+ print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
- tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
- model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from huggingface.co and cache.
- input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
- input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
- outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
- print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
+ tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
+ model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from huggingface.co and cache.
+ input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
+ input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
+ outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
+ print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
- tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
- model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from huggingface.co and cache.
- input_context = 'My cute dog'
- bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
- input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
- outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated
- """
+ tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
+ model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from huggingface.co and cache.
+ input_context = 'My cute dog'
+ bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
+ input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context
+ outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated
+ ```"""
# We cannot generate if the model does not have a LM head
if self.get_output_embeddings() is None:
@@ -1448,7 +1438,7 @@ class TFGenerationMixin:
self, logits, cur_len, max_length, forced_bos_token_id, forced_eos_token_id, **kwargs
):
"""
- Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
+ Implement in subclasses of [`PreTrainedModel`] for custom behavior to adjust the logits in
the generate method.
"""
vocab_size = getattr(self.config, "vocab_size", None)
@@ -1546,12 +1536,12 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In
Args:
logits: logits distribution shape (batch size, vocabulary size)
- top_k (:obj:`int`, `optional`, defaults to 0):
+ top_k (`int`, *optional*, defaults to 0):
If > 0, only keep the top k tokens with highest probability (top-k filtering)
- top_p (:obj:`float`, `optional`, defaults to 1.0):
+ top_p (`float`, *optional*, defaults to 1.0):
If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
- min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+ min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimumber of tokens we keep per batch example in the output.
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 24ac094bfc..f8abfa53b8 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -60,19 +60,19 @@ class GreedySearchDecoderOnlyOutput(ModelOutput):
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor`
- with each tensor of shape :obj:`(batch_size, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor`
+ with each tensor of shape `(batch_size, config.vocab_size)`.
+ attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -90,28 +90,27 @@ class GreedySearchEncoderDecoderOutput(ModelOutput):
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor
- of shape :obj:`(batch_size, config.vocab_size)`).
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size,
- num_heads, sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor
+ of shape `(batch_size, config.vocab_size)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -130,20 +129,19 @@ class SampleDecoderOnlyOutput(ModelOutput):
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor`
- with each tensor of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor`
+ with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+ attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, num_heads, generated_length,
- sequence_length)`.
- hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, generated_length, hidden_size)`.
+ `torch.FloatTensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -161,29 +159,28 @@ class SampleEncoderDecoderOutput(ModelOutput):
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
- at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor
- of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape
- :obj:`(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size*num_return_sequences, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor
+ of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape
+ `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, num_heads, generated_length,
- sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `torch.FloatTensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, generated_length, hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -201,24 +198,22 @@ class BeamSearchDecoderOnlyOutput(ModelOutput):
Base class for outputs of decoder-only generation models using beam search.
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of
- shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of
+ shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+ attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length,
- sequence_length)`.
- hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
- hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -236,34 +231,31 @@ class BeamSearchEncoderDecoderOutput(ModelOutput):
attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape
- :obj:`(batch_size*num_beams, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size,
- num_heads, sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape
+ `(batch_size*num_beams, config.vocab_size)`.
+ attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, num_heads,
- generated_length, sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
- hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -282,23 +274,22 @@ class BeamSampleDecoderOnlyOutput(ModelOutput):
Base class for outputs of decoder-only generation models using beam sample.
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of
- shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
- attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of
+ shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+ attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length,
- sequence_length)`.
- hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+ hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -316,32 +307,30 @@ class BeamSampleEncoderDecoderOutput(ModelOutput):
encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
Args:
- sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_beams, sequence_length)`):
- The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
- shorter if all batches finished early due to the :obj:`eos_token_id`.
- sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
- Final beam scores of the generated ``sequences``.
- scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+ sequences (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+ shorter if all batches finished early due to the `eos_token_id`.
+ sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+ Final beam scores of the generated `sequences`.
+ scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
- . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape
- :obj:`(batch_size*num_beams, config.vocab_size)`).
- encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size,
- num_heads, sequence_length, sequence_length)`.
- encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size*num_beams, sequence_length, hidden_size)`.
- decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ . `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape
+ `(batch_size*num_beams, config.vocab_size)`.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer of the encoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape `(batch_size*num_beams, sequence_length, hidden_size)`.
+ decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length,
- sequence_length)`.
- cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+ `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+ cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
- decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+ decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
- :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+ `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
"""
sequences: torch.LongTensor = None
@@ -366,7 +355,7 @@ ENCODER_MODEL_INPUT_NAMES = ["input_ids", "inputs_embeds", "input_values", "inpu
class GenerationMixin:
"""
A class containing all of the functions supporting generation, to be used as a mixin in
- :class:`~transformers.PreTrainedModel`.
+ [`PreTrainedModel`].
"""
def _prepare_model_inputs(
@@ -428,14 +417,14 @@ class GenerationMixin:
def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs) -> Dict[str, Any]:
"""
- Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
+ Implement in subclasses of [`PreTrainedModel`] for custom behavior to prepare inputs in the
generate method.
"""
return {"input_ids": input_ids}
def adjust_logits_during_generation(self, logits: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
"""
- Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
+ Implement in subclasses of [`PreTrainedModel`] for custom behavior to adjust the logits in
the generate method.
"""
return logits
@@ -612,8 +601,8 @@ class GenerationMixin:
self, top_k: int = None, top_p: float = None, temperature: float = None, num_beams: int = None
) -> LogitsProcessorList:
"""
- This class returns a :class:`~transformers.LogitsProcessorList` list object that contains all relevant
- :class:`~transformers.LogitsWarper` instances used for multinomial sampling.
+ This method returns a [`LogitsProcessorList`] list object that contains all relevant
+ [`LogitsWarper`] instances used for multinomial sampling.
"""
# init warp parameters
@@ -653,8 +642,8 @@ class GenerationMixin:
logits_processor: Optional[LogitsProcessorList],
) -> LogitsProcessorList:
"""
- This class returns a :class:`~transformers.LogitsProcessorList` list object that contains all relevant
- :class:`~transformers.LogitsProcessor` instances used to modify the scores of the language model head.
+ This method returns a [`LogitsProcessorList`] list object that contains all relevant
+ [`LogitsProcessor`] instances used to modify the scores of the language model head.
"""
processors = LogitsProcessorList()
@@ -793,198 +782,196 @@ class GenerationMixin:
Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
multinomial sampling, beam-search decoding, and beam-search multinomial sampling.
- Apart from :obj:`inputs`, all the arguments below will default to the value of the attribute of the same name
- inside the :class:`~transformers.PretrainedConfig` of the model. The default values indicated are the default
+ Apart from `inputs`, all the arguments below will default to the value of the attribute of the same name
+ inside the [`PretrainedConfig`] of the model. The default values indicated are the default
values of those config.
- Most of these parameters are explained in more detail in `this blog post
- `__.
+ Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
Parameters:
- inputs (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, :obj:`(batch_size, sequence_length, feature_dim)` or :obj:`(batch_size, num_channels, height, width)`, `optional`):
- The sequence used as a prompt for the generation or as model inputs to the encoder. If :obj:`None` the
- method initializes it with :obj:`bos_token_id` and a batch size of 1. For decoder-only models
- :obj:`inputs` should of in the format of :obj:`input_ids`. For encoder-decoder models `inputs` can
- represent any of :obj:`input_ids`, :obj:`input_values`, :obj:`input_features`, or :obj:`pixel_values`.
- max_length (:obj:`int`, `optional`, defaults to :obj:`model.config.max_length`):
+ inputs (`torch.Tensor` of shape `(batch_size, sequence_length)`, `(batch_size, sequence_length, feature_dim)` or `(batch_size, num_channels, height, width)`, *optional*):
+ The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+ method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models
+ `inputs` should be in the format of `input_ids`. For encoder-decoder models `inputs` can
+ represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+ max_length (`int`, *optional*, defaults to `model.config.max_length`):
The maximum length of the sequence to be generated.
- max_new_tokens (:obj:`int`, `optional`, defaults to None):
+ max_new_tokens (`int`, *optional*, defaults to None):
The maximum numbers of tokens to generate, ignore the current number of tokens. Use either
- :obj:`max_new_tokens` or :obj:`max_length` but not both, they serve the same purpose.
- min_length (:obj:`int`, `optional`, defaults to 10):
+ `max_new_tokens` or `max_length` but not both, they serve the same purpose.
+ min_length (`int`, *optional*, defaults to 10):
The minimum length of the sequence to be generated.
- do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_sample (`bool`, *optional*, defaults to `False`):
Whether or not to use sampling ; use greedy decoding otherwise.
- early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
- num_beams (:obj:`int`, `optional`, defaults to 1):
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+ num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search. 1 means no beam search.
- temperature (:obj:`float`, `optional`, defaults to 1.0):
+ temperature (`float`, *optional*, defaults to 1.0):
The value used to module the next token probabilities.
- top_k (:obj:`int`, `optional`, defaults to 50):
+ top_k (`int`, *optional*, defaults to 50):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
- top_p (:obj:`float`, `optional`, defaults to 1.0):
- If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or
+ top_p (`float`, *optional*, defaults to 1.0):
+ If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
- repetition_penalty (:obj:`float`, `optional`, defaults to 1.0):
- The parameter for repetition penalty. 1.0 means no penalty. See `this paper
- `__ for more details.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- bos_token_id (:obj:`int`, `optional`):
- The id of the `beginning-of-sequence` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+ repetition_penalty (`float`, *optional*, defaults to 1.0):
+ The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ bos_token_id (`int`, *optional*):
+ The id of the *beginning-of-sequence* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
sequences.
- no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once.
- encoder_no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
- If set to int > 0, all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the
- ``decoder_input_ids``.
- bad_words_ids(:obj:`List[List[int]]`, `optional`):
+ encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+ If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
+ `decoder_input_ids`.
+ bad_words_ids (`List[List[int]]`, *optional*):
List of token ids that are not allowed to be generated. In order to get the tokens of the words that
- should not appear in the generated text, use :obj:`tokenizer(bad_word,
- add_prefix_space=True).input_ids`.
- num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+ should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`.
+ num_return_sequences (`int`, *optional*, defaults to 1):
The number of independently computed returned sequences for each element in the batch.
- max_time(:obj:`float`, `optional`, defaults to None):
+ max_time (`float`, *optional*, defaults to None):
The maximum amount of time you allow the computation to run for in seconds. generation will still
finish the current pass after allocated time has been passed.
- attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
- Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for
tokens that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same
- shape as :obj:`input_ids` that masks the pad token. `What are attention masks?
- <../glossary.html#attention-mask>`__
- decoder_start_token_id (:obj:`int`, `optional`):
- If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
- use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ shape as `input_ids` that masks the pad token. [What are attention masks?](../glossary#attention-mask)
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should use the past last key/values attentions (if applicable to the model) to
speed up decoding.
- num_beam_groups (:obj:`int`, `optional`, defaults to 1):
- Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
- beams. `this paper `__ for more details.
- diversity_penalty (:obj:`float`, `optional`, defaults to 0.0):
+ num_beam_groups (`int`, *optional*, defaults to 1):
+ Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+ beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+ diversity_penalty (`float`, *optional*, defaults to 0.0):
This value is subtracted from a beam's score if it generates a token same as any beam from other group
- at a particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is
+ at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is
enabled.
- prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`, `optional`):
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constraints the beam search to allowed tokens only at each step. If not
- provided no constraint is applied. This function takes 2 arguments: the batch ID :obj:`batch_id` and
- :obj:`input_ids`. It has to return a list with the allowed tokens for the next generation step
- conditioned on the batch ID :obj:`batch_id` and the previously generated tokens :obj:`inputs_ids`. This
+ provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+ `input_ids`. It has to return a list with the allowed tokens for the next generation step
+ conditioned on the batch ID `batch_id` and the previously generated tokens `input_ids`. This
argument is useful for constrained generation conditioned on the prefix, as described in
- `Autoregressive Entity Retrieval `__.
- logits_processor (:obj:`LogitsProcessorList`, `optional`):
+ [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904).
+ logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and a
model's config. If a logit processor is passed that is already created with the arguments or a model's
config an error is thrown. This feature is intended for advanced users.
- stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
model's config. If a stopping criteria is passed that is already created with the arguments or a
model's config an error is thrown. This feature is intended for advanced users.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- forced_bos_token_id (:obj:`int`, `optional`):
- The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`.
- Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ forced_bos_token_id (`int`, *optional*):
+ The id of the token to force as the first generated token after the `decoder_start_token_id`.
+ Useful for multilingual models like [mBART](../model_doc/mbart) where the first generated token
needs to be the target language token.
- forced_eos_token_id (:obj:`int`, `optional`):
- The id of the token to force as the last generated token when :obj:`max_length` is reached.
- remove_invalid_values (:obj:`bool`, `optional`):
- Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to
- crash. Note that using ``remove_invalid_values`` can slow down generation.
- synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ forced_eos_token_id (`int`, *optional*):
+ The id of the token to force as the last generated token when `max_length` is reached.
+ remove_invalid_values (`bool`, *optional*):
+ Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method from
+ crashing. Note that using `remove_invalid_values` can slow down generation.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If the
model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific
- kwargs should be prefixed with `decoder_`.
+ kwargs should be prefixed with `decoder_`.
Return:
- :class:`~transformers.file_utils.ModelOutput` or :obj:`torch.LongTensor`: A
- :class:`~transformers.file_utils.ModelOutput` (if ``return_dict_in_generate=True`` or when
- ``config.return_dict_in_generate=True``) or a :obj:`torch.FloatTensor`.
+ [`~file_utils.ModelOutput`] or `torch.LongTensor`: A
+ [`~file_utils.ModelOutput`] (if `return_dict_in_generate=True` or when
+ `config.return_dict_in_generate=True`) or a `torch.LongTensor`.
- If the model is `not` an encoder-decoder model (``model.config.is_encoder_decoder=False``), the
- possible :class:`~transformers.file_utils.ModelOutput` types are:
+ If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the
+ possible [`~file_utils.ModelOutput`] types are:
- - :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`,
- - :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`,
- - :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`,
- - :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput`
+ - [`~generation_utils.GreedySearchDecoderOnlyOutput`],
+ - [`~generation_utils.SampleDecoderOnlyOutput`],
+ - [`~generation_utils.BeamSearchDecoderOnlyOutput`],
+ - [`~generation_utils.BeamSampleDecoderOnlyOutput`]
- If the model is an encoder-decoder model (``model.config.is_encoder_decoder=True``), the possible
- :class:`~transformers.file_utils.ModelOutput` types are:
+ If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+ [`~file_utils.ModelOutput`] types are:
- - :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput`,
- - :class:`~transformers.generation_utils.SampleEncoderDecoderOutput`,
- - :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput`,
- - :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput`
+ - [`~generation_utils.GreedySearchEncoderDecoderOutput`],
+ - [`~generation_utils.SampleEncoderDecoderOutput`],
+ - [`~generation_utils.BeamSearchEncoderDecoderOutput`],
+ - [`~generation_utils.BeamSampleEncoderDecoderOutput`]
- Examples::
- >>> from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
+ Examples:
- >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
- >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
- >>> # do greedy decoding without providing a prompt
- >>> outputs = model.generate(max_length=40)
- >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```python
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
- >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
- >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
- >>> document = (
- ... "at least two people were killed in a suspected bomb attack on a passenger bus "
- ... "in the strife-torn southern philippines on monday , the military said."
- ... )
- >>> # encode input context
- >>> input_ids = tokenizer(document, return_tensors="pt").input_ids
- >>> # generate 3 independent sequences using beam search decoding (5 beams)
- >>> # with T5 encoder-decoder model conditioned on short news article.
- >>> outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+ >>> # do greedy decoding without providing a prompt
+ >>> outputs = model.generate(max_length=40)
+ >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
- >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
- >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
- >>> input_context = "The dog"
- >>> # encode input context
- >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
- >>> # generate 3 candidates using sampling
- >>> outputs = model.generate(input_ids=input_ids, max_length=20, num_return_sequences=3, do_sample=True)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+ >>> document = (
+ ... "at least two people were killed in a suspected bomb attack on a passenger bus "
+ ... "in the strife-torn southern philippines on monday , the military said."
+ ... )
+ >>> # encode input context
+ >>> input_ids = tokenizer(document, return_tensors="pt").input_ids
+ >>> # generate 3 independent sequences using beam search decoding (5 beams)
+ >>> # with T5 encoder-decoder model conditioned on short news article.
+ >>> outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- >>> tokenizer = AutoTokenizer.from_pretrained("ctrl")
- >>> model = AutoModelForCausalLM.from_pretrained("ctrl")
- >>> # "Legal" is one of the control codes for ctrl
- >>> input_context = "Legal My neighbor is"
- >>> # encode input context
- >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
- >>> outputs = model.generate(input_ids=input_ids, max_length=20, repetition_penalty=1.2)
- >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+ >>> input_context = "The dog"
+ >>> # encode input context
+ >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
+ >>> # generate 3 candidates using sampling
+ >>> outputs = model.generate(input_ids=input_ids, max_length=20, num_return_sequences=3, do_sample=True)
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- >>> tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
- >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
- >>> input_context = "My cute dog"
- >>> # get tokens of words that should not be generated
- >>> bad_words_ids = tokenizer(["idiot", "stupid", "shut up"], add_prefix_space=True).input_ids
- >>> # encode input context
- >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
- >>> # generate sequences without allowing bad_words to be generated
- >>> outputs = model.generate(input_ids=input_ids, max_length=20, do_sample=True, bad_words_ids=bad_words_ids)
- >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
- """
+ >>> tokenizer = AutoTokenizer.from_pretrained("ctrl")
+ >>> model = AutoModelForCausalLM.from_pretrained("ctrl")
+ >>> # "Legal" is one of the control codes for ctrl
+ >>> input_context = "Legal My neighbor is"
+ >>> # encode input context
+ >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
+ >>> outputs = model.generate(input_ids=input_ids, max_length=20, repetition_penalty=1.2)
+ >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
+ >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+ >>> input_context = "My cute dog"
+ >>> # get tokens of words that should not be generated
+ >>> bad_words_ids = tokenizer(["idiot", "stupid", "shut up"], add_prefix_space=True).input_ids
+ >>> # encode input context
+ >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
+ >>> # generate sequences without allowing bad_words to be generated
+ >>> outputs = model.generate(input_ids=input_ids, max_length=20, do_sample=True, bad_words_ids=bad_words_ids)
+ >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```"""
# 1. Set generation parameters if not already defined
bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
num_beams = num_beams if num_beams is not None else self.config.num_beams
@@ -1292,75 +1279,76 @@ class GenerationMixin:
Parameters:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
- logits_processor (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsProcessor`] used to modify the prediction scores of the language modeling
head applied at each generation step.
- stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
- An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
- :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+ [`StoppingCriteria`] used to tell if the generation loop should stop.
- max_length (:obj:`int`, `optional`, defaults to 20):
- **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
generated tokens. The maximum length of the sequence to be generated.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs:
- Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the
- model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+ Additional model specific keyword arguments will be forwarded to the `forward` function of the
+ model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
- :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`,
- :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
- :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
- :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput` if
- ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
- :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` if
- ``model.config.is_encoder_decoder=True``.
+ [`~generation_utils.GreedySearchDecoderOnlyOutput`],
+ [`~generation_utils.GreedySearchEncoderDecoderOutput`] or `torch.LongTensor`: A
+ `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation_utils.GreedySearchDecoderOnlyOutput`] if
+ `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+ [`~generation_utils.GreedySearchEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForCausalLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... )
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForCausalLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... )
- >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
- >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+ >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
- >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
- >>> model.config.pad_token_id = model.config.eos_token_id
+ >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+ >>> model.config.pad_token_id = model.config.eos_token_id
- >>> input_prompt = "Today is a beautiful day, and"
- >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+ >>> input_prompt = "Today is a beautiful day, and"
+ >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList([
- ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
- ... ])
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList([
+ ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+ ... ])
- >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
+ >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- """
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```"""
# init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -1518,85 +1506,86 @@ class GenerationMixin:
Parameters:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
- logits_processor (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsProcessor`] used to modify the prediction scores of the language modeling
head applied at each generation step.
- stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
- An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
- :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
- logits_warper (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+ [`StoppingCriteria`] used to tell if the generation loop should stop.
+ logits_warper (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsWarper`] used to warp the prediction score distribution of the language
modeling head applied before multinomial sampling at each generation step.
- max_length (:obj:`int`, `optional`, defaults to 20):
- **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
generated tokens. The maximum length of the sequence to be generated.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
- model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If
+ model is an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
- :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`,
- :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A
- :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
- :class:`~transformers.generation_utils.SampleDecoderOnlyOutput` if
- ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
- :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` if
- ``model.config.is_encoder_decoder=True``.
+ [`~generation_utils.SampleDecoderOnlyOutput`],
+ [`~generation_utils.SampleEncoderDecoderOutput`] or `torch.LongTensor`: A
+ `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation_utils.SampleDecoderOnlyOutput`] if
+ `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+ [`~generation_utils.SampleEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForCausalLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... TopKLogitsWarper,
- ... TemperatureLogitsWarper,
- ... )
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForCausalLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... TopKLogitsWarper,
+ ... TemperatureLogitsWarper,
+ ... )
- >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
- >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+ >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
- >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
- >>> model.config.pad_token_id = model.config.eos_token_id
+ >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+ >>> model.config.pad_token_id = model.config.eos_token_id
- >>> input_prompt = "Today is a beautiful day, and"
- >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+ >>> input_prompt = "Today is a beautiful day, and"
+ >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList([
- ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
- ... ])
- >>> # instantiate logits processors
- >>> logits_warper = LogitsProcessorList([
- ... TopKLogitsWarper(50),
- ... TemperatureLogitsWarper(0.7),
- ... ])
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList([
+ ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+ ... ])
+ >>> # instantiate logits warpers
+ >>> logits_warper = LogitsProcessorList([
+ ... TopKLogitsWarper(50),
+ ... TemperatureLogitsWarper(0.7),
+ ... ])
- >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)
+ >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- """
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```"""
# init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
@@ -1759,97 +1748,98 @@ class GenerationMixin:
Parameters:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
- beam_scorer (:obj:`BeamScorer`):
- An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
+ beam_scorer (`BeamScorer`):
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are
constructed, stored and sorted during generation. For more information, the documentation of
- :class:`~transformers.BeamScorer` should be read.
- logits_processor (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+ [`BeamScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsProcessor`] used to modify the prediction scores of the language modeling
head applied at each generation step.
- stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
- An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
- :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
- max_length (:obj:`int`, `optional`, defaults to 20):
- **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+ [`StoppingCriteria`] used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
generated tokens. The maximum length of the sequence to be generated.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
- model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If
+ model is an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
- :class:`~transformers.generation_utilsBeamSearchDecoderOnlyOutput`,
- :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
- :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
- :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
- ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
- :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if
- ``model.config.is_encoder_decoder=True``.
+ [`~generation_utils.BeamSearchDecoderOnlyOutput`],
+ [`~generation_utils.BeamSearchEncoderDecoderOutput`] or `torch.LongTensor`: A
+ `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation_utils.BeamSearchDecoderOnlyOutput`] if
+ `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+ [`~generation_utils.BeamSearchEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForSeq2SeqLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... BeamSearchScorer,
- ... )
- >>> import torch
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForSeq2SeqLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... BeamSearchScorer,
+ ... )
+ >>> import torch
- >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
- >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
- >>> encoder_input_str = "translate English to German: How old are you?"
- >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+ >>> encoder_input_str = "translate English to German: How old are you?"
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
- >>> # lets run beam search using 3 beams
- >>> num_beams = 3
- >>> # define decoder start token ids
- >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
- >>> input_ids = input_ids * model.config.decoder_start_token_id
+ >>> # let's run beam search using 3 beams
+ >>> num_beams = 3
+ >>> # define decoder start token ids
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
- >>> # add encoder_outputs to model keyword arguments
- >>> model_kwargs = {
- ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
- ... }
+ >>> # add encoder_outputs to model keyword arguments
+ >>> model_kwargs = {
+ ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+ ... }
- >>> # instantiate beam scorer
- >>> beam_scorer = BeamSearchScorer(
- ... batch_size=1,
- ... num_beams=num_beams,
- ... device=model.device,
- ... )
+ >>> # instantiate beam scorer
+ >>> beam_scorer = BeamSearchScorer(
+ ... batch_size=1,
+ ... num_beams=num_beams,
+ ... device=model.device,
+ ... )
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList([
- ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
- ... ])
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList([
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+ ... ])
- >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+ >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- """
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```"""
# init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -2052,109 +2042,110 @@ class GenerationMixin:
Parameters:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
- beam_scorer (:obj:`BeamScorer`):
- A derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
+ beam_scorer (`BeamScorer`):
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are
constructed, stored and sorted during generation. For more information, the documentation of
- :class:`~transformers.BeamScorer` should be read.
- logits_processor (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+ [`BeamScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsProcessor`] used to modify the prediction scores of the language modeling
head applied at each generation step.
- stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
- An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
- :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
- logits_warper (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+ [`StoppingCriteria`] used to tell if the generation loop should stop.
+ logits_warper (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsWarper`] used to warp the prediction score distribution of the language
modeling head applied before multinomial sampling at each generation step.
- max_length (:obj:`int`, `optional`, defaults to 20):
- **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
generated tokens. The maximum length of the sequence to be generated.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs:
- Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
- model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If
+ model is an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
- :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput`,
- :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A
- :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
- :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput` if
- ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
- :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` if
- ``model.config.is_encoder_decoder=True``.
+ [`~generation_utils.BeamSampleDecoderOnlyOutput`],
+ [`~generation_utils.BeamSampleEncoderDecoderOutput`] or `torch.LongTensor`: A
+ `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation_utils.BeamSampleDecoderOnlyOutput`] if
+ `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+ [`~generation_utils.BeamSampleEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForSeq2SeqLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... TopKLogitsWarper,
- ... TemperatureLogitsWarper,
- ... BeamSearchScorer,
- ... )
- >>> import torch
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForSeq2SeqLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... TopKLogitsWarper,
+ ... TemperatureLogitsWarper,
+ ... BeamSearchScorer,
+ ... )
+ >>> import torch
- >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
- >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
- >>> encoder_input_str = "translate English to German: How old are you?"
- >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+ >>> encoder_input_str = "translate English to German: How old are you?"
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
- >>> # lets run beam search using 3 beams
- >>> num_beams = 3
- >>> # define decoder start token ids
- >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
- >>> input_ids = input_ids * model.config.decoder_start_token_id
+ >>> # let's run beam search using 3 beams
+ >>> num_beams = 3
+ >>> # define decoder start token ids
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
- >>> # add encoder_outputs to model keyword arguments
- >>> model_kwargs = {
- ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
- ... }
+ >>> # add encoder_outputs to model keyword arguments
+ >>> model_kwargs = {
+ ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+ ... }
- >>> # instantiate beam scorer
- >>> beam_scorer = BeamSearchScorer(
- ... batch_size=1,
- ... max_length=model.config.max_length,
- ... num_beams=num_beams,
- ... device=model.device,
- ... )
+ >>> # instantiate beam scorer
+ >>> beam_scorer = BeamSearchScorer(
+ ... batch_size=1,
+ ... max_length=model.config.max_length,
+ ... num_beams=num_beams,
+ ... device=model.device,
+ ... )
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList([
- ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)
- ... ])
- >>> # instantiate logits processors
- >>> logits_warper = LogitsProcessorList([
- ... TopKLogitsWarper(50),
- ... TemperatureLogitsWarper(0.7),
- ... ])
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList([
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)
+ ... ])
+ >>> # instantiate logits warpers
+ >>> logits_warper = LogitsProcessorList([
+ ... TopKLogitsWarper(50),
+ ... TemperatureLogitsWarper(0.7),
+ ... ])
- >>> outputs = model.beam_sample(
- ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
- ... )
+ >>> outputs = model.beam_sample(
+ ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
+ ... )
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- """
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```"""
# init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -2354,102 +2345,103 @@ class GenerationMixin:
Parameters:
- input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
- beam_scorer (:obj:`BeamScorer`):
- An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
+ beam_scorer (`BeamScorer`):
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are
constructed, stored and sorted during generation. For more information, the documentation of
- :class:`~transformers.BeamScorer` should be read.
- logits_processor (:obj:`LogitsProcessorList`, `optional`):
- An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
- :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+ [`BeamScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from
+ [`LogitsProcessor`] used to modify the prediction scores of the language modeling
head applied at each generation step.
- stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
- An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
- :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
- max_length (:obj:`int`, `optional`, defaults to 20):
- **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+ [`StoppingCriteria`] used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
generated tokens. The maximum length of the sequence to be generated.
- pad_token_id (:obj:`int`, `optional`):
- The id of the `padding` token.
- eos_token_id (:obj:`int`, `optional`):
- The id of the `end-of-sequence` token.
- output_attentions (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
- output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
- output_scores (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
- return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
- synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs:
- Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If
- model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+ Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
+ model is an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
- :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`,
- :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
- :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
- :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
- :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
- ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
- :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if
- ``model.config.is_encoder_decoder=True``.
+ [`~generation_utils.BeamSearchDecoderOnlyOutput`],
+ [`~generation_utils.BeamSearchEncoderDecoderOutput`] or `torch.LongTensor`: A
+ `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation_utils.BeamSearchDecoderOnlyOutput`] if
+ `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a
+ [`~generation_utils.BeamSearchEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForSeq2SeqLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... HammingDiversityLogitsProcessor,
- ... BeamSearchScorer,
- ... )
- >>> import torch
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForSeq2SeqLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... HammingDiversityLogitsProcessor,
+ ... BeamSearchScorer,
+ ... )
+ >>> import torch
- >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
- >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
- >>> encoder_input_str = "translate English to German: How old are you?"
- >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+ >>> encoder_input_str = "translate English to German: How old are you?"
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
- >>> # lets run diverse beam search using 6 beams
- >>> num_beams = 6
- >>> # define decoder start token ids
- >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
- >>> input_ids = input_ids * model.config.decoder_start_token_id
+ >>> # let's run diverse beam search using 6 beams
+ >>> num_beams = 6
+ >>> # define decoder start token ids
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
- >>> # add encoder_outputs to model keyword arguments
- >>> model_kwargs = {
- ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
- ... }
+ >>> # add encoder_outputs to model keyword arguments
+ >>> model_kwargs = {
+ ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+ ... }
- >>> # instantiate beam scorer
- >>> beam_scorer = BeamSearchScorer(
- ... batch_size=1,
- ... max_length=model.config.max_length,
- ... num_beams=num_beams,
- ... device=model.device,
- ... num_beam_groups=3
- ... )
+ >>> # instantiate beam scorer
+ >>> beam_scorer = BeamSearchScorer(
+ ... batch_size=1,
+ ... max_length=model.config.max_length,
+ ... num_beams=num_beams,
+ ... device=model.device,
+ ... num_beam_groups=3
+ ... )
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList([
- ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
- ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
- ... ])
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList([
+ ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+ ... ])
- >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+ >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
- >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
- """
+ >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+ ```"""
# init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -2688,12 +2680,12 @@ def top_k_top_p_filtering(
Args:
logits: logits distribution shape (batch size, vocabulary size)
- top_k (:obj:`int`, `optional`, defaults to 0):
+ top_k (`int`, *optional*, defaults to 0):
If > 0, only keep the top k tokens with highest probability (top-k filtering)
- top_p (:obj:`float`, `optional`, defaults to 1.0):
+ top_p (`float`, *optional*, defaults to 1.0):
If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
- min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+ min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimumber of tokens we keep per batch example in the output.
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index d6cf5badbe..773f3e1cad 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -41,14 +41,14 @@ def is_torch_tensor(obj):
def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
"""
- Loads :obj:`image` to a PIL Image.
+ Loads `image` to a PIL Image.
Args:
- image (:obj:`str` or :obj:`PIL.Image.Image`):
+ image (`str` or `PIL.Image.Image`):
The image to convert to the PIL Image format.
Returns:
- :obj:`PIL.Image.Image`: A PIL Image.
+ `PIL.Image.Image`: A PIL Image.
"""
if isinstance(image, str):
if image.startswith("http://") or image.startswith("https://"):
@@ -87,15 +87,15 @@ class ImageFeatureExtractionMixin:
def to_pil_image(self, image, rescale=None):
"""
- Converts :obj:`image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
+ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
axis if needed.
Args:
- image (:obj:`PIL.Image.Image` or :obj:`numpy.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
The image to convert to the PIL Image format.
- rescale (:obj:`bool`, `optional`):
+ rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
- default to :obj:`True` if the image type is a floating type, :obj:`False` otherwise.
+ default to `True` if the image type is a floating type, `False` otherwise.
"""
self._ensure_format_supported(image)
@@ -117,17 +117,17 @@ class ImageFeatureExtractionMixin:
def to_numpy_array(self, image, rescale=None, channel_first=True):
"""
- Converts :obj:`image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
+ Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
dimension.
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to convert to a NumPy array.
- rescale (:obj:`bool`, `optional`):
+ rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
- default to :obj:`True` if the image is a PIL Image or an array/tensor of integers, :obj:`False`
+ default to `True` if the image is a PIL Image or an array/tensor of integers, `False`
otherwise.
- channel_first (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ channel_first (`bool`, *optional*, defaults to `True`):
Whether or not to permute the dimensions of the image to put the channel dimension first.
"""
self._ensure_format_supported(image)
@@ -151,15 +151,15 @@ class ImageFeatureExtractionMixin:
def normalize(self, image, mean, std):
"""
- Normalizes :obj:`image` with :obj:`mean` and :obj:`std`. Note that this will trigger a conversion of
- :obj:`image` to a NumPy array if it's a PIL Image.
+ Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of
+ `image` to a NumPy array if it's a PIL Image.
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize.
- mean (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
The mean (per channel) to use for normalization.
- std (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ std (`List[float]` or `np.ndarray` or `torch.Tensor`):
The standard deviation (per channel) to use for normalization.
"""
self._ensure_format_supported(image)
@@ -187,14 +187,14 @@ class ImageFeatureExtractionMixin:
def resize(self, image, size, resample=PIL.Image.BILINEAR):
"""
- Resizes :obj:`image`. Note that this will trigger a conversion of :obj:`image` to a PIL Image.
+ Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
- size (:obj:`int` or :obj:`Tuple[int, int]`):
+ size (`int` or `Tuple[int, int]`):
The size to use for resizing the image.
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
+ resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
The filter to user for resampling.
"""
self._ensure_format_supported(image)
@@ -210,13 +210,13 @@ class ImageFeatureExtractionMixin:
def center_crop(self, image, size):
"""
- Crops :obj:`image` to the given size using a center crop. Note that if the image is too small to be cropped to
+ Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
the size given, it will be padded (so the returned result has the size asked).
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
- size (:obj:`int` or :obj:`Tuple[int, int]`):
+ size (`int` or `Tuple[int, int]`):
The size to which crop the image.
"""
self._ensure_format_supported(image)
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 57bc0251fb..15ef465131 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -264,11 +264,11 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
@functools.wraps(trainable)
def dynamic_modules_import_trainable(*args, **kwargs):
"""
- Wrapper around ``tune.with_parameters`` to ensure datasets_modules are loaded on each Actor.
+ Wrapper around `tune.with_parameters` to ensure datasets_modules are loaded on each Actor.
Without this, an ImportError will be thrown. See https://github.com/huggingface/transformers/issues/11565.
- Assumes that ``_objective``, defined above, is a function.
+ Assumes that `_objective`, defined above, is a function.
"""
if is_datasets_available():
import datasets.load
@@ -372,11 +372,10 @@ def rewrite_logs(d):
class TensorBoardCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that sends the logs to `TensorBoard
- `__.
+ A [`TrainerCallback`] that sends the logs to [TensorBoard](https://www.tensorflow.org/tensorboard).
Args:
- tb_writer (:obj:`SummaryWriter`, `optional`):
+ tb_writer (`SummaryWriter`, *optional*):
The writer to use. Will instantiate one if not set.
"""
@@ -461,7 +460,7 @@ class TensorBoardCallback(TrainerCallback):
class WandbCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that sends the logs to `Weight and Biases `__.
+ A [`TrainerCallback`] that sends the logs to [Weights & Biases](https://www.wandb.com/).
"""
def __init__(self):
@@ -478,22 +477,21 @@ class WandbCallback(TrainerCallback):
def setup(self, args, state, model, **kwargs):
"""
- Setup the optional Weights & Biases (`wandb`) integration.
+ Setup the optional Weights & Biases (`wandb`) integration.
- One can subclass and override this method to customize the setup if needed. Find more information `here
- `__. You can also override the following environment variables:
+ One can subclass and override this method to customize the setup if needed. Find more information [here](https://docs.wandb.ai/integrations/huggingface). You can also override the following environment variables:
Environment:
- WANDB_LOG_MODEL (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ WANDB_LOG_MODEL (`bool`, *optional*, defaults to `False`):
Whether or not to log model as artifact at the end of training. Use along with
- `TrainingArguments.load_best_model_at_end` to upload best model.
- WANDB_WATCH (:obj:`str`, `optional` defaults to :obj:`"gradients"`):
- Can be :obj:`"gradients"`, :obj:`"all"` or :obj:`"false"`. Set to :obj:`"false"` to disable gradient
- logging or :obj:`"all"` to log gradients and parameters.
- WANDB_PROJECT (:obj:`str`, `optional`, defaults to :obj:`"huggingface"`):
+ `TrainingArguments.load_best_model_at_end` to upload best model.
+ WANDB_WATCH (`str`, *optional*, defaults to `"gradients"`):
+ Can be `"gradients"`, `"all"` or `"false"`. Set to `"false"` to disable gradient
+ logging or `"all"` to log gradients and parameters.
+ WANDB_PROJECT (`str`, *optional*, defaults to `"huggingface"`):
Set this to a custom string to store results in a different project.
- WANDB_DISABLED (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to disable wandb entirely. Set `WANDB_DISABLED=true` to disable.
+ WANDB_DISABLED (`bool`, *optional*, defaults to `False`):
+ Whether or not to disable wandb entirely. Set `WANDB_DISABLED=true` to disable.
"""
if self._wandb is None:
return
@@ -585,7 +583,7 @@ class WandbCallback(TrainerCallback):
class CometCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML `__.
+ A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/).
"""
def __init__(self):
@@ -599,19 +597,18 @@ class CometCallback(TrainerCallback):
Setup the optional Comet.ml integration.
Environment:
- COMET_MODE (:obj:`str`, `optional`):
+ COMET_MODE (`str`, *optional*):
Whether to create an online, offline experiment or disable Comet logging. Can be "OFFLINE", "ONLINE",
or "DISABLED". Defaults to "ONLINE".
- COMET_PROJECT_NAME (:obj:`str`, `optional`):
+ COMET_PROJECT_NAME (`str`, *optional*):
Comet project name for experiments
- COMET_OFFLINE_DIRECTORY (:obj:`str`, `optional`):
- Folder to use for saving offline experiments when :obj:`COMET_MODE` is "OFFLINE"
- COMET_LOG_ASSETS (:obj:`str`, `optional`):
+ COMET_OFFLINE_DIRECTORY (`str`, *optional*):
+ Folder to use for saving offline experiments when `COMET_MODE` is "OFFLINE"
+ COMET_LOG_ASSETS (`str`, *optional*):
Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or
"FALSE". Defaults to "TRUE".
- For a number of configurable items in the environment, see `here
- `__.
+ For a number of configurable items in the environment, see [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
"""
self._initialized = True
log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
@@ -661,8 +658,7 @@ class CometCallback(TrainerCallback):
class AzureMLCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that sends the logs to `AzureML
- `__.
+ A [`TrainerCallback`] that sends the logs to [AzureML](https://pypi.org/project/azureml-sdk/).
"""
def __init__(self, azureml_run=None):
@@ -685,7 +681,7 @@ class AzureMLCallback(TrainerCallback):
class MLflowCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow `__.
+ A [`TrainerCallback`] that sends the logs to [MLflow](https://www.mlflow.org/).
"""
def __init__(self):
@@ -705,11 +701,11 @@ class MLflowCallback(TrainerCallback):
Setup the optional MLflow integration.
Environment:
- HF_MLFLOW_LOG_ARTIFACTS (:obj:`str`, `optional`):
+ HF_MLFLOW_LOG_ARTIFACTS (`str`, *optional*):
Whether to use MLflow .log_artifact() facility to log artifacts.
- This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, will copy
- whatever is in :class:`~transformers.TrainingArguments`'s ``output_dir`` to the local or remote
+ This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, will copy
+ whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote
artifact storage. Using it without a remote storage will just copy the files to your artifact location.
"""
log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
@@ -774,7 +770,7 @@ class MLflowCallback(TrainerCallback):
class NeptuneCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that sends the logs to `Neptune `.
+ A [`TrainerCallback`] that sends the logs to [Neptune](https://neptune.ai).
"""
def __init__(self):
@@ -793,13 +789,13 @@ class NeptuneCallback(TrainerCallback):
Setup the Neptune integration.
Environment:
- NEPTUNE_PROJECT (:obj:`str`, `required`):
- The project ID for neptune.ai account. Should be in format `workspace_name/project_name`
- NEPTUNE_API_TOKEN (:obj:`str`, `required`):
+ NEPTUNE_PROJECT (`str`, *required*):
+ The project ID for neptune.ai account. Should be in format `workspace_name/project_name`
+ NEPTUNE_API_TOKEN (`str`, *required*):
API-token for neptune.ai account
- NEPTUNE_CONNECTION_MODE (:obj:`str`, `optional`):
- Neptune connection mode. `async` by default
- NEPTUNE_RUN_NAME (:obj:`str`, `optional`):
+ NEPTUNE_CONNECTION_MODE (`str`, *optional*):
+ Neptune connection mode. `async` by default
+ NEPTUNE_RUN_NAME (`str`, *optional*):
The name of run process on Neptune dashboard
"""
if state.is_world_process_zero:
@@ -831,7 +827,7 @@ class NeptuneCallback(TrainerCallback):
def __del__(self):
"""
Environment:
- NEPTUNE_STOP_TIMEOUT (:obj:`int`, `optional`):
+ NEPTUNE_STOP_TIMEOUT (`int`, *optional*):
Number of seconsds to wait for all Neptune.ai tracking calls to finish, before stopping the tracked
run. If not set it will wait for all tracking calls to finish.
"""
@@ -845,7 +841,7 @@ class NeptuneCallback(TrainerCallback):
class CodeCarbonCallback(TrainerCallback):
"""
- A :class:`~transformers.TrainerCallback` that tracks the CO2 emission of training.
+ A [`TrainerCallback`] that tracks the CO2 emission of training.
"""
def __init__(self):
diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py
index ff1b938cec..670248524e 100644
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@@ -29,32 +29,32 @@ class PushToHubCallback(Callback):
**model_card_args
):
"""
- output_dir (:obj:`str`):
+ output_dir (`str`):
The output directory where the model predictions and checkpoints will be written and synced with the
repository on the Hub.
- save_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"epoch"`):
+ save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"epoch"`):
The checkpoint save strategy to adopt during training. Possible values are:
- * :obj:`"no"`: No save is done during training.
- * :obj:`"epoch"`: Save is done at the end of each epoch.
- * :obj:`"steps"`: Save is done every :obj:`save_steps`
- save_steps (:obj:`int`, `optional`):
+ - `"no"`: No save is done during training.
+ - `"epoch"`: Save is done at the end of each epoch.
+ - `"steps"`: Save is done every `save_steps`
+ save_steps (`int`, *optional*):
The number of steps between saves when using the "steps" save_strategy.
- tokenizer (:obj:`PreTrainedTokenizerBase`, `optional`):
+ tokenizer (`PreTrainedTokenizerBase`, *optional*):
The tokenizer used by the model. If supplied, will be uploaded to the repo alongside the weights.
- hub_model_id (:obj:`str`, `optional`):
- The name of the repository to keep in sync with the local `output_dir`. It can be a simple model ID in
+ hub_model_id (`str`, *optional*):
+ The name of the repository to keep in sync with the local `output_dir`. It can be a simple model ID in
which case the model will be pushed in your namespace. Otherwise it should be the whole repository name,
- for instance :obj:`"user_name/model"`, which allows you to push to an organization you are a member of with
- :obj:`"organization_name/model"`.
+ for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
+ `"organization_name/model"`.
- Will default to to the name of :obj:`output_dir`.
- hub_token (:obj:`str`, `optional`):
+ Will default to the name of `output_dir`.
+ hub_token (`str`, *optional*):
The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
- :obj:`huggingface-cli login`.
- checkpoint (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `huggingface-cli login`.
+ checkpoint (`bool`, *optional*, defaults to `False`):
Whether to save full training checkpoints (including epoch and optimizer state) to allow training to be
- resumed. Only usable when `save_strategy` is `epoch`.
+ resumed. Only usable when `save_strategy` is `"epoch"`.
"""
super().__init__()
if checkpoint and save_strategy != "epoch":
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index dc663ca493..184bd80660 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -126,53 +126,53 @@ class ModelCard:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
- Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
+ Instantiate a [`ModelCard`] from a pre-trained model model card.
Parameters:
pretrained_model_name_or_path: either:
- - a string, the `model id` of a pretrained model card hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a
- user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a model card file saved using the
- :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
- - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
+ - a string, the *model id* of a pretrained model card hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+ user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a model card file saved using the
+ [`~ModelCard.save_pretrained`] method, e.g.: `./my_model_directory/`.
+ - a path or url to a saved model card JSON *file*, e.g.: `./my_model_directory/modelcard.json`.
- cache_dir: (`optional`) string:
+ cache_dir: (*optional*) string:
Path to a directory in which a downloaded pre-trained model card should be cached if the standard cache
should not be used.
- kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+ kwargs: (*optional*) dict: key/value pairs with which to update the ModelCard object after loading.
- The values in kwargs of any keys which are model card attributes will be used to override the loaded
values.
- Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the
- `return_unused_kwargs` keyword parameter.
+ `return_unused_kwargs` keyword parameter.
- proxies: (`optional`) dict, default None:
+ proxies: (*optional*) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
- find_from_standard_name: (`optional`) boolean, default True:
+ find_from_standard_name: (*optional*) boolean, default True:
If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them
with our standard modelcard filename. Can be used to directly feed a model/config url and access the
colocated modelcard.
- return_unused_kwargs: (`optional`) bool:
+ return_unused_kwargs: (*optional*) bool:
- If False, then this function returns just the final model card object.
- - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a
+ - If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a
dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of
- kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+ kwargs which has not been used to update `ModelCard` and is otherwise ignored.
- Examples::
+ Examples:
- modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from huggingface.co and cache.
- modelcard = ModelCard.from_pretrained('./test/saved_model/') # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
- modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
- modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
-
- """
+ ```python
+ modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from huggingface.co and cache.
+ modelcard = ModelCard.from_pretrained('./test/saved_model/') # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+ modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
+ modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+ ```"""
# This imports every model so let's do it dynamically here.
from transformers.models.auto.configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py
index c822f11e98..100e032a38 100644
--- a/src/transformers/modeling_flax_pytorch_utils.py
+++ b/src/transformers/modeling_flax_pytorch_utils.py
@@ -69,7 +69,7 @@ def rename_key_and_reshape_tensor(
"""Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
def is_key_or_prefix_key_in_dict(key: Tuple[str]) -> bool:
- """Checks if ``key`` of ``(prefix,) + key`` is in random_flax_state_dict"""
+ """Checks if `key` or `(prefix,) + key` is in random_flax_state_dict"""
return len(set(random_flax_state_dict) & set([key, (model_prefix,) + key])) > 0
# layer norm
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 2be53474c3..945349f361 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -67,17 +67,17 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
r"""
Base class for all models.
- :class:`~transformers.FlaxPreTrainedModel` takes care of storing the configuration of the models and handles
+ [`FlaxPreTrainedModel`] takes care of storing the configuration of the models and handles
methods for loading, downloading and saving models.
Class attributes (overridden by derived classes):
- - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
- :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
+ - **config_class** ([`PretrainedConfig`]) -- A subclass of
+ [`PretrainedConfig`] to use as configuration class for this model architecture.
+ - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
derived classes of the same architecture adding modules on top of the base model.
- - **main_input_name** (:obj:`str`) -- The name of the principal input to the model (often :obj:`input_ids` for
- NLP models, :obj:`pixel_values` for vision models and :obj:`input_values` for speech models).
+ - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
+ NLP models, `pixel_values` for vision models and `input_values` for speech models).
"""
config_class = None
base_model_prefix = ""
@@ -159,7 +159,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any:
"""
- Helper method to cast floating-point values of given parameter ``PyTree`` to given ``dtype``.
+ Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`.
"""
# taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27
@@ -183,94 +183,97 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
r"""
- Cast the floating-point ``params`` to ``jax.numpy.bfloat16``. This returns a new ``params`` tree and does not
- cast the ``params`` in place.
+ Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not
+ cast the `params` in place.
This method can be used on TPU to explicitly convert the model parameters to bfloat16 precision to do full
half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed.
Arguments:
- params (:obj:`Union[Dict, FrozenDict]`):
- A ``PyTree`` of model parameters.
- mask (:obj:`Union[Dict, FrozenDict]`):
- A ``PyTree`` with same structure as the ``params`` tree. The leaves should be booleans, :obj:`True` for
- params you want to cast, and should be :obj:`False` for those you want to skip.
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ mask (`Union[Dict, FrozenDict]`):
+ A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
+ params you want to cast, and should be `False` for those you want to skip.
- Examples::
+ Examples:
- >>> from transformers import FlaxBertModel
- >>> # load model
- >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
- >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
- >>> model.params = model.to_bf16(model.params)
- >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
- >>> # then pass the mask as follows
- >>> from flax import traverse_util
- >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
- >>> flat_params = traverse_util.flatten_dict(model.params)
- >>> mask = {path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
- >>> mask = traverse_util.unflatten_dict(mask)
- >>> model.params = model.to_bf16(model.params, mask)
- """
+ ```python
+ >>> from transformers import FlaxBertModel
+ >>> # load model
+ >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+ >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
+ >>> model.params = model.to_bf16(model.params)
+ >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+ >>> # then pass the mask as follows
+ >>> from flax import traverse_util
+ >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+ >>> flat_params = traverse_util.flatten_dict(model.params)
+ >>> mask = {path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+ >>> mask = traverse_util.unflatten_dict(mask)
+ >>> model.params = model.to_bf16(model.params, mask)
+ ```"""
return self._cast_floating_to(params, jnp.bfloat16, mask)
def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
r"""
- Cast the floating-point ``parmas`` to ``jax.numpy.float32``. This method can be used to explicitly convert the
- model parameters to fp32 precision. This returns a new ``params`` tree and does not cast the ``params`` in
+ Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
+ model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in
place.
Arguments:
- params (:obj:`Union[Dict, FrozenDict]`):
- A ``PyTree`` of model parameters.
- mask (:obj:`Union[Dict, FrozenDict]`):
- A ``PyTree`` with same structure as the ``params`` tree. The leaves should be booleans, :obj:`True` for
- params you want to cast, and should be :obj:`False` for those you want to skip
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ mask (`Union[Dict, FrozenDict]`):
+ A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
+ params you want to cast, and should be `False` for those you want to skip
- Examples::
+ Examples:
- >>> from transformers import FlaxBertModel
- >>> # Download model and configuration from huggingface.co
- >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
- >>> # By default, the model params will be in fp32, to illustrate the use of this method,
- >>> # we'll first cast to fp16 and back to fp32
- >>> model.params = model.to_f16(model.params)
- >>> # now cast back to fp32
- >>> model.params = model.to_fp32(model.params)
- """
+ ```python
+ >>> from transformers import FlaxBertModel
+ >>> # Download model and configuration from huggingface.co
+ >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+ >>> # By default, the model params will be in fp32, to illustrate the use of this method,
+ >>> # we'll first cast to fp16 and back to fp32
+ >>> model.params = model.to_fp16(model.params)
+ >>> # now cast back to fp32
+ >>> model.params = model.to_fp32(model.params)
+ ```"""
return self._cast_floating_to(params, jnp.float32, mask)
def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
r"""
- Cast the floating-point ``parmas`` to ``jax.numpy.float16``. This returns a new ``params`` tree and does not
- cast the ``params`` in place.
+ Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not
+ cast the `params` in place.
This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full
half-precision training or to save weights in float16 for inference in order to save memory and improve speed.
Arguments:
- params (:obj:`Union[Dict, FrozenDict]`):
- A ``PyTree`` of model parameters.
- mask (:obj:`Union[Dict, FrozenDict]`):
- A ``PyTree`` with same structure as the ``params`` tree. The leaves should be booleans, :obj:`True` for
- params you want to cast, and should be :obj:`False` for those you want to skip
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ mask (`Union[Dict, FrozenDict]`):
+ A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
+ params you want to cast, and should be `False` for those you want to skip
- Examples::
+ Examples:
- >>> from transformers import FlaxBertModel
- >>> # load model
- >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
- >>> # By default, the model params will be in fp32, to cast these to float16
- >>> model.params = model.to_fp16(model.params)
- >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
- >>> # then pass the mask as follows
- >>> from flax import traverse_util
- >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
- >>> flat_params = traverse_util.flatten_dict(model.params)
- >>> mask = {path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
- >>> mask = traverse_util.unflatten_dict(mask)
- >>> model.params = model.to_fp16(model.params, mask)
- """
+ ```python
+ >>> from transformers import FlaxBertModel
+ >>> # load model
+ >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+ >>> # By default, the model params will be in fp32, to cast these to float16
+ >>> model.params = model.to_fp16(model.params)
+ >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+ >>> # then pass the mask as follows
+ >>> from flax import traverse_util
+ >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+ >>> flat_params = traverse_util.flatten_dict(model.params)
+ >>> mask = {path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+ >>> mask = traverse_util.unflatten_dict(mask)
+ >>> model.params = model.to_fp16(model.params, mask)
+ ```"""
return self._cast_floating_to(params, jnp.float16, mask)
@classmethod
@@ -285,104 +288,104 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
r"""
Instantiate a pretrained flax model from a pre-trained model configuration.
- The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
- The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
weights are discarded.
Parameters:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `pt index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In this
- case, ``from_pt`` should be set to :obj:`True`.
- dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
- The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
- GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *pt index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In this
+ case, `from_pt` should be set to `True`.
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+ GPUs) and `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
- specified all the computation will be performed with the given ``dtype``.
+ specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see
- :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and
- :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
- model_args (sequence of positional arguments, `optional`):
- All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
- config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`):
+ [`~FlaxPreTrainedModel.to_fp16`] and
+ [`~FlaxPreTrainedModel.to_bf16`].
+ model_args (sequence of positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+ config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
Can be either:
- - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
- - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
+ - an instance of a class derived from [`PretrainedConfig`],
+ - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
by supplying the save directory.
- - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
- configuration JSON file named `config.json` is found in the directory.
- cache_dir (:obj:`Union[str, os.PathLike]`, `optional`):
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+ configuration JSON file named *config.json* is found in the directory.
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ from_pt (`bool`, *optional*, defaults to `False`):
Load the model weights from a PyTorch checkpoint save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
checkpoint with 3 labels).
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
- underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
- initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
- ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
- with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
- attribute will be passed to the underlying model's ``__init__`` function.
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+ `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+ with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+ attribute will be passed to the underlying model's `__init__` function.
- Examples::
+ Examples:
- >>> from transformers import BertConfig, FlaxBertModel
- >>> # Download model and configuration from huggingface.co and cache.
- >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
- >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
- >>> model = FlaxBertModel.from_pretrained('./test/saved_model/')
- >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
- >>> config = BertConfig.from_json_file('./pt_model/config.json')
- >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
- """
+ ```python
+ >>> from transformers import BertConfig, FlaxBertModel
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+ >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
+ >>> model = FlaxBertModel.from_pretrained('./test/saved_model/')
+ >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
+ >>> config = BertConfig.from_json_file('./pt_model/config.json')
+ >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
+ ```"""
config = kwargs.pop("config", None)
cache_dir = kwargs.pop("cache_dir", None)
from_pt = kwargs.pop("from_pt", False)
@@ -592,24 +595,26 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
def save_pretrained(self, save_directory: Union[str, os.PathLike], params=None, push_to_hub=False, **kwargs):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
- `:func:`~transformers.FlaxPreTrainedModel.from_pretrained`` class method
+ [`~FlaxPreTrainedModel.from_pretrained`] class method
Arguments:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory to which to save. Will be created if it doesn't exist.
- push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it.
- .. warning::
+ <Tip warning={true}>
- Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
- :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
- pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
- instead.
+ Using `push_to_hub=True` will synchronize the repository you are pushing to with
+ `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+ pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+ instead.
+
+ </Tip>
kwargs:
Additional key word arguments passed along to the
- :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+ [`~file_utils.PushToHubMixin.push_to_hub`] method.
"""
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index bb66e3f62f..b562686b68 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -74,7 +74,7 @@ def dummy_loss(y_true, y_pred):
class TFModelUtilsMixin:
"""
- A few utilities for :obj:`tf.keras.Model`, to be used as a mixin.
+ A few utilities for `tf.keras.Model`, to be used as a mixin.
"""
def num_parameters(self, only_trainable: bool = False) -> int:
@@ -82,11 +82,11 @@ class TFModelUtilsMixin:
Get the number of (optionally, trainable) parameters in the model.
Args:
- only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ only_trainable (`bool`, *optional*, defaults to `False`):
Whether or not to return only the number of trainable parameters
Returns:
- :obj:`int`: The number of parameters.
+ `int`: The number of parameters.
"""
if only_trainable:
return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables))
@@ -100,16 +100,16 @@ def keras_serializable(cls):
This is done by:
- 1. Adding a :obj:`transformers_config` dict to the Keras config dictionary in :obj:`get_config` (called by Keras at
+ 1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at
serialization time.
- 2. Wrapping :obj:`__init__` to accept that :obj:`transformers_config` dict (passed by Keras at deserialization
+ 2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization
time) and convert it to a config object for the actual layer initializer.
3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not
- need to be supplied in :obj:`custom_objects` in the call to :obj:`tf.keras.models.load_model`.
+ need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`.
Args:
- cls (a :obj:`tf.keras.layers.Layers subclass`):
- Typically a :obj:`TF.MainLayer` class in this project, in general must accept a :obj:`config` argument to
+ cls (a `tf.keras.layers.Layer` subclass):
+ Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to
its initializer.
Returns:
@@ -163,10 +163,11 @@ class TFCausalLanguageModelingLoss:
"""
Loss function suitable for causal language modeling (CLM), that is, the task of guessing the next token.
- .. note::
+ <Tip>
- Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+ Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+ </Tip>
"""
def compute_loss(self, labels, logits):
@@ -199,10 +200,11 @@ class TFTokenClassificationLoss:
"""
Loss function suitable for token classification.
- .. note::
+ <Tip>
- Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+ Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+ </Tip>
"""
def compute_loss(self, labels, logits):
@@ -252,9 +254,11 @@ class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
"""
Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
- .. note::
+ <Tip>
- Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+ Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+
+ </Tip>
"""
@@ -262,8 +266,11 @@ class TFNextSentencePredictionLoss:
"""
Loss function suitable for next sentence prediction (NSP), that is, the task of guessing the next sentence.
- .. note::
- Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+ <Tip>
+
+ Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+
+ </Tip>
"""
def compute_loss(self, labels, logits):
@@ -285,7 +292,7 @@ def booleans_processing(config, **kwargs):
graph)
Args:
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The config of the running model.
**kwargs:
The boolean parameters
@@ -345,9 +352,9 @@ def input_processing(func, config, input_ids, **kwargs):
name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training.
Args:
- func (:obj:`callable`):
+ func (`callable`):
The callable function of the TensorFlow model.
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The config of the running model.
**kwargs:
The inputs of the model.
@@ -491,11 +498,11 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
Detect missing and unexpected layers and load the TF weights accordingly to their names and shapes.
Args:
- model (:obj:`tf.keras.models.Model`):
+ model (`tf.keras.models.Model`):
The model to load the weights into.
- resolved_archive_file (:obj:`str`):
+ resolved_archive_file (`str`):
The location of the H5 file.
- ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
Whether or not to ignore weights with shapes that don't match between the checkpoint of the model.
Returns:
@@ -641,20 +648,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
r"""
Base class for all TF models.
- :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods
+ [`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods
for loading, downloading and saving models as well as a few methods common to all models to:
- * resize the input embeddings,
- * prune heads in the self-attention heads.
+ - resize the input embeddings,
+ - prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
- :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
+ - **config_class** ([`PretrainedConfig`]) -- A subclass of
+ [`PretrainedConfig`] to use as configuration class for this model architecture.
+ - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
derived classes of the same architecture adding modules on top of the base model.
- - **main_input_name** (:obj:`str`) -- The name of the principal input to the model (often :obj:`input_ids` for
- NLP models, :obj:`pixel_values` for vision models and :obj:`input_values` for speech models).
+ - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
+ NLP models, `pixel_values` for vision models and `input_values` for speech models).
"""
config_class = None
base_model_prefix = ""
@@ -674,7 +681,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Dummy inputs to build the network.
Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ `Dict[str, tf.Tensor]`: The dummy inputs.
"""
return {
"input_ids": tf.constant(DUMMY_INPUTS),
@@ -729,7 +736,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Method used for serving the model.
Args:
- inputs (:obj:`Dict[str, tf.Tensor]`):
+ inputs (`Dict[str, tf.Tensor]`):
The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)
@@ -741,7 +748,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Prepare the output of the saved model. Each model must implement this function.
Args:
- output (:class:`~transformers.TFBaseModelOutput`):
+ output ([`TFBaseModelOutput`]):
The output returned by the model.
"""
raise NotImplementedError
@@ -751,7 +758,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Returns the model's input embeddings layer.
Returns:
- :obj:`tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
+ `tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
"""
main_layer = getattr(self, self.base_model_prefix, self)
@@ -779,12 +786,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
the checkpoint was made.
Args:
- repo_path_or_name (:obj:`str`):
+ repo_path_or_name (`str`):
Can either be a repository name for your {object} in the Hub or a path to a local folder (in which case
the repository will have the name of that local folder).
Returns:
- :obj:`dict`: A dictionary of extra metadata from the checkpoint, most commonly an "epoch" count.
+ `dict`: A dictionary of extra metadata from the checkpoint, most commonly an "epoch" count.
"""
if getattr(self, "optimizer", None) is None:
raise RuntimeError(
@@ -971,7 +978,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Set model's input embeddings
Args:
- value (:obj:`tf.Variable`):
+ value (`tf.Variable`):
The new weights mapping hidden states to vocabulary.
"""
main_layer = getattr(self, self.base_model_prefix)
@@ -991,7 +998,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Returns the model's output embeddings
Returns:
- :obj:`tf.Variable`: The new weights mapping vocabulary to hidden states.
+ `tf.Variable`: The new weights mapping vocabulary to hidden states.
"""
if self.get_lm_head() is not None:
lm_head = self.get_lm_head()
@@ -1011,7 +1018,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Set model's output embeddings
Args:
- value (:obj:`tf.Variable`):
+ value (`tf.Variable`):
The new weights mapping hidden states to vocabulary.
"""
if self.get_lm_head() is not None:
@@ -1029,7 +1036,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
embeddings
Return:
- :obj:`tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
+ `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
"""
warnings.warn(
"The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
@@ -1041,7 +1048,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Get the concatenated _prefix name of the bias from the model name to the parent layer
Return:
- :obj:`str`: The _prefix name of the bias.
+ `str`: The _prefix name of the bias.
"""
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return None
@@ -1051,7 +1058,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Dict of bias attached to an LM head. The key represents the name of the bias attribute.
Return:
- :obj:`tf.Variable`: The weights representing the bias, None if not an LM model.
+ `tf.Variable`: The weights representing the bias, None if not an LM model.
"""
if self.get_lm_head() is not None:
lm_head = self.get_lm_head()
@@ -1068,7 +1075,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Set all the bias in the LM head.
Args:
- value (:obj:`Dict[tf.Variable]`):
+ value (`Dict[tf.Variable]`):
All the new bias attached to an LM head.
"""
if self.get_lm_head() is not None:
@@ -1084,25 +1091,25 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
The LM Head layer. This method must be overwritten by all the models that have a lm head.
Return:
- :obj:`tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
+ `tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
"""
return None
def resize_token_embeddings(self, new_num_tokens=None) -> tf.Variable:
"""
- Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
- Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
- vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
- just returns a pointer to the input tokens :obj:`tf.Variable` module of the model without doing
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`,
+ just returns a pointer to the input tokens `tf.Variable` module of the model without doing
anything.
Return:
- :obj:`tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
+ `tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
"""
if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
return self._get_word_embedding_weight(self.get_input_embeddings())
@@ -1166,16 +1173,16 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Reducing the size will remove vectors from the end
Args:
- old_lm_head_bias (:obj:`tf.Variable`):
+ old_lm_head_bias (`tf.Variable`):
Old lm head bias to be resized.
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
New number of tokens in the linear matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
- vectors from the end. If not provided or :obj:`None`, just returns None
+ vectors from the end. If not provided or `None`, just returns None
Return:
- :obj:`tf.Variable`: Pointer to the resized bias.
+ `tf.Variable`: Pointer to the resized bias.
"""
new_lm_head_bias = {}
@@ -1218,16 +1225,16 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Reducing the size will remove vectors from the end
Args:
- old_lm_head_decoder (:obj:`tf.Variable`):
+ old_lm_head_decoder (`tf.Variable`):
Old lm head decoder to be resized.
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
New number of tokens in the linear matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
- vectors from the end. If not provided or :obj:`None`, just returns None
+ vectors from the end. If not provided or `None`, just returns None
Return:
- :obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
+ `tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
input ones.
"""
new_lm_head_decoder = old_lm_head_decoder
@@ -1256,18 +1263,18 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
initialized vectors at the end. Reducing the size will remove vectors from the end
Args:
- old_embeddings (:obj:`tf.Variable`):
+ old_embeddings (`tf.Variable`):
Old embeddings to be resized.
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
- vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
- :obj:`tf.Variable`` module of the model without doing anything.
+ vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
+ `tf.Variable` module of the model without doing anything.
Return:
- :obj:`tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if
- :obj:`new_num_tokens` is :obj:`None`
+ `tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if
+ `new_num_tokens` is `None`
"""
old_embedding_dim = shape_list(old_embeddings)[1]
init_range = getattr(self.config, "initializer_range", 0.02)
@@ -1289,9 +1296,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Prunes heads of the base model.
Arguments:
- heads_to_prune (:obj:`Dict[int, List[int]]`):
- Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
- heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
+ heads_to_prune (`Dict[int, List[int]]`):
+ Dictionary with keys being selected layer indices (`int`) and associated values being the list of
+ heads to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
0 and 2 on layer 1 and heads 2 and 3 on layer 2.
"""
raise NotImplementedError
@@ -1299,30 +1306,32 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_hub=False, **kwargs):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
- :func:`~transformers.TFPreTrainedModel.from_pretrained` class method.
+ [`~TFPreTrainedModel.from_pretrained`] class method.
Arguments:
- save_directory (:obj:`str`):
+ save_directory (`str`):
Directory to which to save. Will be created if it doesn't exist.
- saved_model (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ saved_model (`bool`, *optional*, defaults to `False`):
If the model has to be saved in saved model format as well or not.
- version (:obj:`int`, `optional`, defaults to 1):
+ version (`int`, *optional*, defaults to 1):
The version of the saved model. A saved model needs to be versioned in order to be properly loaded by
TensorFlow Serving as detailed in the official documentation
https://www.tensorflow.org/tfx/serving/serving_basic
- push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it.
- .. warning::
+
- Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
- :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
- pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
- instead.
+ Using `push_to_hub=True` will synchronize the repository you are pushing to with
+ `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+ pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+ instead.
+
+
kwargs:
Additional key word arguments passed along to the
- :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+ [`~file_utils.PushToHubMixin.push_to_hub`] method.
"""
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -1357,113 +1366,113 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
r"""
Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.
- The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
- The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
weights are discarded.
Parameters:
- pretrained_model_name_or_path (:obj:`str`, `optional`):
+ pretrained_model_name_or_path (`str`, *optional*):
Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In
- this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the PyTorch model in a
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In
+ this case, `from_pt` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the PyTorch model in a
TensorFlow model using the provided conversion scripts and loading the TensorFlow model
afterwards.
- - :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
- arguments ``config`` and ``state_dict``).
- model_args (sequence of positional arguments, `optional`):
- All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
- config (:obj:`Union[PretrainedConfig, str]`, `optional`):
+ - `None` if you are both providing the configuration and state dictionary (resp. with keyword
+ arguments `config` and `state_dict`).
+ model_args (sequence of positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+ config (`Union[PretrainedConfig, str]`, *optional*):
Can be either:
- - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
- - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
+ - an instance of a class derived from [`PretrainedConfig`],
+ - a string valid as input to [`~PretrainedConfig.from_pretrained`].
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- - The model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded
+ - The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded
by supplying the save directory.
- - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
- configuration JSON file named `config.json` is found in the directory.
- from_pt: (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+ configuration JSON file named *config.json* is found in the directory.
+ from_pt: (`bool`, *optional*, defaults to `False`):
Load the model weights from a PyTorch state_dict save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
checkpoint with 3 labels).
- cache_dir (:obj:`str`, `optional`):
+ cache_dir (`str`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies: (:obj:`Dict[str, str], `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ proxies: (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
- local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try doanloading the model).
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision(`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- mirror(:obj:`str`, `optional`):
+ mirror(`str`, *optional*):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
Please refer to the mirror site for more information.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
- underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
- initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
- ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
- with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
- attribute will be passed to the underlying model's ``__init__`` function.
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+ `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+ with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+ attribute will be passed to the underlying model's `__init__` function.
- .. note::
+
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
- Examples::
+
- >>> from transformers import BertConfig, TFBertModel
- >>> # Download model and configuration from huggingface.co and cache.
- >>> model = TFBertModel.from_pretrained('bert-base-uncased')
- >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
- >>> model = TFBertModel.from_pretrained('./test/saved_model/')
- >>> # Update configuration during loading.
- >>> model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=True)
- >>> assert model.config.output_attentions == True
- >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
- >>> config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
- >>> model = TFBertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)
+ Examples:
- """
+ ```python
+ >>> from transformers import BertConfig, TFBertModel
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model = TFBertModel.from_pretrained('bert-base-uncased')
+ >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
+ >>> model = TFBertModel.from_pretrained('./test/saved_model/')
+ >>> # Update configuration during loading.
+ >>> model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+ >>> assert model.config.output_attentions == True
+ >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
+ >>> config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
+ >>> model = TFBertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)
+ ```"""
config = kwargs.pop("config", None)
cache_dir = kwargs.pop("cache_dir", None)
from_pt = kwargs.pop("from_pt", False)
@@ -1685,14 +1694,14 @@ class TFConv1D(tf.keras.layers.Layer):
Basically works like a linear layer but the weights are transposed.
Args:
- nf (:obj:`int`):
+ nf (`int`):
The number of output features.
- nx (:obj:`int`):
+ nx (`int`):
The number of input features.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation to use to initialize the weights.
kwargs:
- Additional keyword arguments passed along to the :obj:`__init__` of :obj:`tf.keras.layers.Layer`.
+ Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
"""
def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
@@ -1726,15 +1735,15 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
modeling.
Args:
- vocab_size (:obj:`int`):
+ vocab_size (`int`):
The size of the vocabulary, e.g., the number of unique tokens.
- hidden_size (:obj:`int`):
+ hidden_size (`int`):
The size of the embedding vectors.
- initializer_range (:obj:`float`, `optional`):
+ initializer_range (`float`, *optional*):
The standard deviation to use when initializing the weights. If no value is provided, it will default to
- :math:`1/\sqrt{hidden\_size}`.
+ \\(1/\sqrt{hidden\_size}\\).
kwargs:
- Additional keyword arguments passed along to the :obj:`__init__` of :obj:`tf.keras.layers.Layer`.
+ Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
"""
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):
@@ -1768,25 +1777,24 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
Get token embeddings of inputs or decode final hidden state.
Args:
- inputs (:obj:`tf.Tensor`):
- In embedding mode, should be an int64 tensor with shape :obj:`[batch_size, length]`.
+ inputs (`tf.Tensor`):
+ In embedding mode, should be an int64 tensor with shape `[batch_size, length]`.
- In linear mode, should be a float tensor with shape :obj:`[batch_size, length, hidden_size]`.
- mode (:obj:`str`, defaults to :obj:`"embedding"`):
- A valid value is either :obj:`"embedding"` or :obj:`"linear"`, the first one indicates that the layer
+ In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`.
+ mode (`str`, defaults to `"embedding"`):
+ A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer
should be used as an embedding layer, the second one that the layer should be used as a linear decoder.
Returns:
- :obj:`tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
- :obj:`[batch_size, length, embedding_size]`.
+ `tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
+ `[batch_size, length, embedding_size]`.
- In linear mode, the output is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
+ In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`.
Raises:
- ValueError: if :obj:`mode` is not valid.
+ ValueError: if `mode` is not valid.
- Shared weights logic is adapted from `here
- `__.
+ Shared weights logic is adapted from [here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24).
"""
if mode == "embedding":
return self._embedding(inputs)
@@ -1821,31 +1829,31 @@ class TFSequenceSummary(tf.keras.layers.Layer):
Compute a single vector summary of a sequence hidden states.
Args:
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
+ - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
- - :obj:`"last"` -- Take the last token hidden state (like XLNet)
- - :obj:`"first"` -- Take the first token hidden state (like Bert)
- - :obj:`"mean"` -- Take the mean of all tokens hidden states
- - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
- - :obj:`"attn"` -- Not implemented now, use multi-head attention
+ - `"last"` -- Take the last token hidden state (like XLNet)
+ - `"first"` -- Take the first token hidden state (like Bert)
+ - `"mean"` -- Take the mean of all tokens hidden states
+ - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+ - `"attn"` -- Not implemented now, use multi-head attention
- - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
- - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
- :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
- - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
- output, another string or :obj:`None` will add no activation.
- - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
+ - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+ - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to
+ `config.num_labels` classes (otherwise to `config.hidden_size`).
+ - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the
+ output, another string or `None` will add no activation.
+ - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and
activation.
- - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and
+ - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and
activation.
- initializer_range (:obj:`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
+ initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs:
- Additional keyword arguments passed along to the :obj:`__init__` of :obj:`tf.keras.layers.Layer`.
+ Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
"""
def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs):
@@ -1937,10 +1945,10 @@ def shape_list(tensor: tf.Tensor) -> List[int]:
Deal with dynamic shape in tensorflow cleanly.
Args:
- tensor (:obj:`tf.Tensor`): The tensor we want the shape of.
+ tensor (`tf.Tensor`): The tensor we want the shape of.
Returns:
- :obj:`List[int]`: The shape of the tensor as a list.
+ `List[int]`: The shape of the tensor as a list.
"""
dynamic = tf.shape(tensor)
@@ -1954,13 +1962,13 @@ def shape_list(tensor: tf.Tensor) -> List[int]:
def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal:
"""
- Creates a :obj:`tf.initializers.TruncatedNormal` with the given range.
+ Creates a `tf.initializers.TruncatedNormal` with the given range.
Args:
- initializer_range (`float`, defaults to 0.02): Standard deviation of the initializer range.
+ initializer_range (`float`, defaults to 0.02): Standard deviation of the initializer range.
Returns:
- :obj:`tf.initializers.TruncatedNormal`: The truncated normal initializer.
+ `tf.initializers.TruncatedNormal`: The truncated normal initializer.
"""
return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 2cc37a6f94..e4d58ff501 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -89,16 +89,16 @@ def find_pruneable_heads_and_indices(
heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int]
) -> Tuple[Set[int], torch.LongTensor]:
"""
- Finds the heads and their indices taking :obj:`already_pruned_heads` into account.
+ Finds the heads and their indices taking `already_pruned_heads` into account.
Args:
- heads (:obj:`List[int]`): List of the indices of heads to prune.
- n_heads (:obj:`int`): The number of heads in the model.
- head_size (:obj:`int`): The size of each head.
- already_pruned_heads (:obj:`Set[int]`): A set of already pruned heads.
+ heads (`List[int]`): List of the indices of heads to prune.
+ n_heads (`int`): The number of heads in the model.
+ head_size (`int`): The size of each head.
+ already_pruned_heads (`Set[int]`): A set of already pruned heads.
Returns:
- :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices.
+ `Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices.
"""
mask = torch.ones(n_heads, head_size)
heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads
@@ -143,7 +143,7 @@ def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtil
class ModuleUtilsMixin:
"""
- A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin.
+ A few utilities for `torch.nn.Modules`, to be used as a mixin.
"""
@staticmethod
@@ -176,8 +176,8 @@ class ModuleUtilsMixin:
"""
Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
- Increase in memory consumption is stored in a :obj:`mem_rss_diff` attribute for each module and can be reset to
- zero with :obj:`model.reset_memory_hooks_state()`.
+ Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to
+ zero with `model.reset_memory_hooks_state()`.
"""
for module in self.modules():
module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
@@ -186,8 +186,8 @@ class ModuleUtilsMixin:
def reset_memory_hooks_state(self):
"""
- Reset the :obj:`mem_rss_diff` attribute of each module (see
- :func:`~transformers.modeling_utils.ModuleUtilsMixin.add_memory_hooks`).
+ Reset the `mem_rss_diff` attribute of each module (see
+ [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]).
"""
for module in self.modules():
module.mem_rss_diff = 0
@@ -197,7 +197,7 @@ class ModuleUtilsMixin:
@property
def device(self) -> device:
"""
- :obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
"""
return get_parameter_device(self)
@@ -205,7 +205,7 @@ class ModuleUtilsMixin:
@property
def dtype(self) -> torch.dtype:
"""
- :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
"""
return get_parameter_dtype(self)
@@ -214,10 +214,10 @@ class ModuleUtilsMixin:
Invert an attention mask (e.g., switches 0. and 1.).
Args:
- encoder_attention_mask (:obj:`torch.Tensor`): An attention mask.
+ encoder_attention_mask (`torch.Tensor`): An attention mask.
Returns:
- :obj:`torch.Tensor`: The inverted attention mask.
+ `torch.Tensor`: The inverted attention mask.
"""
if encoder_attention_mask.dim() == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
@@ -246,15 +246,15 @@ class ModuleUtilsMixin:
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
- attention_mask (:obj:`torch.Tensor`):
+ attention_mask (`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
- input_shape (:obj:`Tuple[int]`):
+ input_shape (`Tuple[int]`):
The shape of the input to the model.
- device: (:obj:`torch.device`):
+ device (`torch.device`):
The device of the input to the model.
Returns:
- :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
+ `torch.Tensor`: The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
@@ -308,16 +308,16 @@ class ModuleUtilsMixin:
Prepare the head mask if needed.
Args:
- head_mask (:obj:`torch.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`):
+ head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
- num_hidden_layers (:obj:`int`):
+ num_hidden_layers (`int`):
The number of hidden layers in the model.
- is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ is_attention_chunked (`bool`, *optional*, defaults to `False`):
Whether or not the attentions scores are computed by chunks or not.
Returns:
- :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
- list with :obj:`[None]` for each layer.
+ `torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
+ list with `[None]` for each layer.
"""
if head_mask is not None:
head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
@@ -344,14 +344,14 @@ class ModuleUtilsMixin:
Get number of (optionally, trainable or non-embeddings) parameters in the module.
Args:
- only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ only_trainable (`bool`, *optional*, defaults to `False`):
Whether or not to return only the number of trainable parameters
- exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ exclude_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to return only the number of non-embeddings parameters
Returns:
- :obj:`int`: The number of parameters.
+ `int`: The number of parameters.
"""
if exclude_embeddings:
@@ -370,10 +370,10 @@ class ModuleUtilsMixin:
Helper function to estimate the total number of tokens from the model inputs.
Args:
- inputs (:obj:`dict`): The model inputs.
+ inputs (`dict`): The model inputs.
Returns:
- :obj:`int`: The total number of tokens.
+ `int`: The total number of tokens.
"""
if self.main_input_name in input_dict:
return input_dict[self.main_input_name].numel()
@@ -389,22 +389,21 @@ class ModuleUtilsMixin:
"""
Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
- tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
- <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
+ tokens (valid if `12 * d_model << sequence_length`) as laid out in [this paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter
re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
Args:
- batch_size (:obj:`int`):
+ batch_size (`int`):
The batch size for the forward pass.
- sequence_length (:obj:`int`):
+ sequence_length (`int`):
The number of tokens in each line of the batch.
- exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ exclude_embeddings (`bool`, *optional*, defaults to `True`):
Whether or not to count embedding and softmax operations.
Returns:
- :obj:`int`: The number of floating-point operations.
+ `int`: The number of floating-point operations.
"""
return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)
@@ -414,30 +413,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
r"""
Base class for all models.
- :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods
+ [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods
for loading, downloading and saving models as well as a few methods common to all models to:
- * resize the input embeddings,
- * prune heads in the self-attention heads.
+ - resize the input embeddings,
+ - prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
- :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch
+ - **config_class** ([`PretrainedConfig`]) -- A subclass of
+ [`PretrainedConfig`] to use as configuration class for this model architecture.
+ - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch
model, taking as arguments:
- - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
+ - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the
TensorFlow checkpoint.
- - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to
+ - **config** ([`PretrainedConfig`]) -- An instance of the configuration associated to
the model.
- - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.
+ - **path** (`str`) -- A path to the TensorFlow checkpoint.
- - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
+ - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
derived classes of the same architecture adding modules on top of the base model.
- - **is_parallelizable** (:obj:`bool`) -- A flag indicating whether this model supports model parallelization.
- - **main_input_name** (:obj:`str`) -- The name of the principal input to the model (often :obj:`input_ids` for
- NLP models, :obj:`pixel_values` for vision models and :obj:`input_values` for speech models).
+ - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
+ - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
+ NLP models, `pixel_values` for vision models and `input_values` for speech models).
"""
config_class = None
base_model_prefix = ""
@@ -459,7 +458,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
"""
- :obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
+ `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
"""
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
@@ -502,8 +501,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
All context managers that the model should be initialized under go here.
Args:
- torch_dtype (:obj:`torch.dtype`, `optional`):
- Override the default ``torch.dtype`` and load the model under this dtype.
+ torch_dtype (`torch.dtype`, *optional*):
+ Override the default `torch.dtype` and load the model under this dtype.
"""
torch_dtype = kwargs.pop("torch_dtype", None)
@@ -536,15 +535,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
under specific dtype.
Args:
- dtype (:obj:`torch.dtype`):
+ dtype (`torch.dtype`):
a floating dtype to set to.
Returns:
- :obj:`torch.dtype`: the original ``dtype`` that can be used to restore ``torch.set_default_dtype(dtype)``
- if it was modified. If it wasn't, returns :obj:`None`.
+ `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)`
+ if it was modified. If it wasn't, returns `None`.
- Note ``set_default_dtype`` currently only works with floating-point types and asserts if for example,
- ``torch.int64`` is passed. So if a non-float ``dtype`` is passed this functions will throw an exception.
+ Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
+ `torch.int64` is passed. So if a non-float `dtype` is passed this function will throw an exception.
"""
if not dtype.is_floating_point:
raise ValueError(
@@ -559,7 +558,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
@property
def base_model(self) -> nn.Module:
"""
- :obj:`torch.nn.Module`: The main body of the model.
+ `torch.nn.Module`: The main body of the model.
"""
return getattr(self, self.base_model_prefix, self)
@@ -568,7 +567,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
Returns the model's input embeddings.
Returns:
- :obj:`nn.Module`: A torch module mapping vocabulary to hidden states.
+ `nn.Module`: A torch module mapping vocabulary to hidden states.
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
@@ -581,7 +580,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
Set model's input embeddings.
Args:
- value (:obj:`nn.Module`): A module mapping vocabulary to hidden states.
+ value (`nn.Module`): A module mapping vocabulary to hidden states.
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
@@ -594,7 +593,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
Returns the model's output embeddings.
Returns:
- :obj:`nn.Module`: A torch module mapping hidden states to vocabulary.
+ `nn.Module`: A torch module mapping hidden states to vocabulary.
"""
return None # Overwrite for models with output embeddings
@@ -608,7 +607,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
"""
Tie the weights between the input embeddings and the output embeddings.
- If the :obj:`torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
+ If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
the weights instead.
"""
output_embeddings = self.get_output_embeddings()
@@ -719,19 +718,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
"""
- Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
- Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
- vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
- just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model without doing
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`,
+ just returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing
anything.
Return:
- :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
"""
model_embeds = self._resize_token_embeddings(new_num_tokens)
if new_num_tokens is None:
@@ -767,18 +766,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
initialized vectors at the end. Reducing the size will remove vectors from the end
Args:
- old_embeddings (:obj:`torch.nn.Embedding`):
+ old_embeddings (`torch.nn.Embedding`):
Old embeddings to be resized.
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
- vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
- :obj:`torch.nn.Embedding`` module of the model without doing anything.
+ vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
+ `torch.nn.Embedding` module of the model without doing anything.
Return:
- :obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
- :obj:`new_num_tokens` is :obj:`None`
+ `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
+ `new_num_tokens` is `None`
"""
if new_num_tokens is None:
return old_embeddings
@@ -830,21 +829,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
vectors at the end. Reducing the size will remove vectors from the end
Args:
- old_lm_head (:obj:`torch.nn.Linear`):
+ old_lm_head (`torch.nn.Linear`):
+ Old lm head linear layer to be resized.
- new_num_tokens (:obj:`int`, `optional`):
+ new_num_tokens (`int`, *optional*):
New number of tokens in the linear matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
- vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
- :obj:`torch.nn.Linear`` module of the model without doing anything.
- transposed (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether ``old_lm_head`` is transposed or not. If True ``old_lm_head.size()`` is ``lm_head_dim,
- vocab_size`` else ``vocab_size, lm_head_dim``.
+ vectors from the end. If not provided or `None`, just returns a pointer to the input tokens `torch.nn.Linear` module of the model without doing anything.
+ transposed (`bool`, *optional*, defaults to `False`):
+ Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim, vocab_size` else `vocab_size, lm_head_dim`.
Return:
- :obj:`torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if
- :obj:`new_num_tokens` is :obj:`None`
+ `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if
+ `new_num_tokens` is `None`
"""
if new_num_tokens is None:
return old_lm_head
@@ -946,9 +943,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
Prunes heads of the base model.
Arguments:
- heads_to_prune (:obj:`Dict[int, List[int]]`):
- Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
- heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
+ heads_to_prune (`Dict[int, List[int]]`):
+ Dictionary with keys being selected layer indices (`int`) and associated values being the list of
+ heads to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
0 and 2 on layer 1 and heads 2 and 3 on layer 2.
"""
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
@@ -1000,35 +997,37 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
- `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
+ [`~PreTrainedModel.from_pretrained`] class method.
Arguments:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory to which to save. Will be created if it doesn't exist.
- save_config (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ save_config (`bool`, *optional*, defaults to `True`):
Whether or not to save the config of the model. Useful when in distributed training like TPUs and need
- to call this function on all processes. In this case, set :obj:`save_config=True` only on the main
+ to call this function on all processes. In this case, set `save_config=True` only on the main
process to avoid race conditions.
- state_dict (nested dictionary of :obj:`torch.Tensor`):
- The state dictionary of the model to save. Will default to :obj:`self.state_dict()`, but can be used to
+ state_dict (nested dictionary of `torch.Tensor`):
+ The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to
only save parts of the model or if special precautions need to be taken when recovering the state
dictionary of a model (like when using model parallelism).
- save_function (:obj:`Callable`):
+ save_function (`Callable`):
The function to use to save the state dictionary. Useful on distributed training like TPUs when one
- need to replace :obj:`torch.save` by another method.
- push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ need to replace `torch.save` by another method.
+ push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it.
- .. warning::
+ <Tip warning={true}>
- Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
- :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
- pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
- instead.
+ Using `push_to_hub=True` will synchronize the repository you are pushing to with
+ `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+ pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+ instead.
+ </Tip>
+
kwargs:
Additional key word arguments passed along to the
- :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+ [`~file_utils.PushToHubMixin.push_to_hub`] method.
"""
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -1080,152 +1079,155 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
r"""
Instantiate a pretrained pytorch model from a pre-trained model configuration.
- The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To
- train the model, you should first set it back in training mode with ``model.train()``.
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
+ train the model, you should first set it back in training mode with `model.train()`.
- The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
- The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
weights are discarded.
Parameters:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- - A path or url to a model folder containing a `flax checkpoint file` in `.msgpack` format (e.g,
- ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set
- to :obj:`True`.
- - :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
- arguments ``config`` and ``state_dict``).
- model_args (sequence of positional arguments, `optional`):
- All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
- config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`):
+ - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
+ `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set
+ to `True`.
+ - `None` if you are both providing the configuration and state dictionary (resp. with keyword
+ arguments `config` and `state_dict`).
+ model_args (sequence of positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+ config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
Can be either:
- - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
- - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
+ - an instance of a class derived from [`PretrainedConfig`],
+ - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
by supplying the save directory.
- - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
- configuration JSON file named `config.json` is found in the directory.
- state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`):
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+ configuration JSON file named *config.json* is found in the directory.
+ state_dict (`Dict[str, torch.Tensor]`, *optional*):
A state dictionary to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own
weights. In this case though, you should check if using
- :func:`~transformers.PreTrainedModel.save_pretrained` and
- :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
- cache_dir (:obj:`Union[str, os.PathLike]`, `optional`):
+ [`~PreTrainedModel.save_pretrained`] and
+ [`~PreTrainedModel.from_pretrained`] is not a simpler option.
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ from_tf (`bool`, *optional*, defaults to `False`):
Load the model weights from a TensorFlow checkpoint save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- from_flax (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ from_flax (`bool`, *optional*, defaults to `False`):
Load the model weights from a Flax checkpoint save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
checkpoint with 3 labels).
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info (`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
- local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- mirror(:obj:`str`, `optional`):
+ mirror (`str`, *optional*):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
Please refer to the mirror site for more information.
- _fast_init(:obj:`bool`, `optional`, defaults to `:obj:`True`):
+ _fast_init (`bool`, *optional*, defaults to `True`):
Whether or not to disable fast initialization.
- low_cpu_mem_usage(:obj:`bool`, `optional`, defaults to `:obj:`False`):
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
This is an experimental feature and a subject to change at any moment.
- torch_dtype (:obj:`str` or :obj:`torch.dtype`, `optional`):
- Override the default ``torch.dtype`` and load the model under this dtype. If ``"auto"`` is passed the
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the
dtype will be automatically derived from the model's weights.
- .. warning::
+ <Tip warning={true}>
- One should only disable `_fast_init` to ensure backwards compatibility with
- ``transformers.__version__ < 4.6.0`` for seeded model initialization. This argument will be removed
- at the next major version. See `pull request 11471
- <https://github.com/huggingface/transformers/pull/11471>`__ for more information.
+ One should only disable `_fast_init` to ensure backwards compatibility with
+ `transformers.__version__ < 4.6.0` for seeded model initialization. This argument will be removed
+ at the next major version. See [pull request 11471](https://github.com/huggingface/transformers/pull/11471) for more information.
- kwargs (remaining dictionary of keyword arguments, `optional`):
+ </Tip>
+
+ kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
- underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
- initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
- ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
- with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
- attribute will be passed to the underlying model's ``__init__`` function.
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+ `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+ with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+ attribute will be passed to the underlying model's `__init__` function.
- .. note::
+ <Tip>
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
- .. note::
+ </Tip>
- Activate the special `"offline-mode"
- <https://huggingface.co/transformers/installation.html#offline-mode>`__ to use this method in a firewalled
- environment.
+ <Tip>
- Examples::
+ Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use this method in a firewalled
+ environment.
- >>> from transformers import BertConfig, BertModel
- >>> # Download model and configuration from huggingface.co and cache.
- >>> model = BertModel.from_pretrained('bert-base-uncased')
- >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
- >>> model = BertModel.from_pretrained('./test/saved_model/')
- >>> # Update configuration during loading.
- >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
- >>> assert model.config.output_attentions == True
- >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
- >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
- >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
- >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
- >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True)
+ </Tip>
- """
+ Examples:
+
+ ```python
+ >>> from transformers import BertConfig, BertModel
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model = BertModel.from_pretrained('bert-base-uncased')
+ >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
+ >>> model = BertModel.from_pretrained('./test/saved_model/')
+ >>> # Update configuration during loading.
+ >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+ >>> assert model.config.output_attentions == True
+ >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
+ >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
+ >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+ >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
+ >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True)
+ ```"""
config = kwargs.pop("config", None)
state_dict = kwargs.pop("state_dict", None)
cache_dir = kwargs.pop("cache_dir", None)
@@ -1747,8 +1749,8 @@ class Conv1D(nn.Module):
Basically works like a linear layer but the weights are transposed.
Args:
- nf (:obj:`int`): The number of output features.
- nx (:obj:`int`): The number of input features.
+ nf (`int`): The number of output features.
+ nx (`int`): The number of input features.
"""
def __init__(self, nf, nx):
@@ -1771,8 +1773,8 @@ class PoolerStartLogits(nn.Module):
Compute SQuAD start logits from sequence hidden states.
Args:
- config (:class:`~transformers.PretrainedConfig`):
- The config used by the model, will be used to grab the :obj:`hidden_size` of the model.
+ config ([`PretrainedConfig`]):
+ The config used by the model, will be used to grab the `hidden_size` of the model.
"""
def __init__(self, config: PretrainedConfig):
@@ -1784,14 +1786,14 @@ class PoolerStartLogits(nn.Module):
) -> torch.FloatTensor:
"""
Args:
- hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
- p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+ p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
should be masked.
Returns:
- :obj:`torch.FloatTensor`: The start logits for SQuAD.
+ `torch.FloatTensor`: The start logits for SQuAD.
"""
x = self.dense(hidden_states).squeeze(-1)
@@ -1809,9 +1811,9 @@ class PoolerEndLogits(nn.Module):
Compute SQuAD end logits from sequence hidden states.
Args:
- config (:class:`~transformers.PretrainedConfig`):
- The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the
- :obj:`layer_norm_eps` to use.
+ config ([`PretrainedConfig`]):
+ The config used by the model, will be used to grab the `hidden_size` of the model and the
+ `layer_norm_eps` to use.
"""
def __init__(self, config: PretrainedConfig):
@@ -1830,23 +1832,25 @@ class PoolerEndLogits(nn.Module):
) -> torch.FloatTensor:
"""
Args:
- hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
- start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`):
+ start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
The hidden states of the first tokens for the labeled span.
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
The position of the first token for the labeled span.
- p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+ p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
should be masked.
- .. note::
+
- One of ``start_states`` or ``start_positions`` should be not obj:`None`. If both are set,
- ``start_positions`` overrides ``start_states``.
+ One of `start_states` or `start_positions` should not be `None`. If both are set,
+ `start_positions` overrides `start_states`.
+
+
Returns:
- :obj:`torch.FloatTensor`: The end logits for SQuAD.
+ `torch.FloatTensor`: The end logits for SQuAD.
"""
assert (
start_states is not None or start_positions is not None
@@ -1876,8 +1880,8 @@ class PoolerAnswerClass(nn.Module):
Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
Args:
- config (:class:`~transformers.PretrainedConfig`):
- The config used by the model, will be used to grab the :obj:`hidden_size` of the model.
+ config ([`PretrainedConfig`]):
+ The config used by the model, will be used to grab the `hidden_size` of the model.
"""
def __init__(self, config):
@@ -1895,22 +1899,24 @@ class PoolerAnswerClass(nn.Module):
) -> torch.FloatTensor:
"""
Args:
- hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
- start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`):
+ start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
The hidden states of the first tokens for the labeled span.
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
The position of the first token for the labeled span.
- cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
+ cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
- .. note::
+
- One of ``start_states`` or ``start_positions`` should be not obj:`None`. If both are set,
- ``start_positions`` overrides ``start_states``.
+ One of `start_states` or `start_positions` should not be `None`. If both are set,
+ `start_positions` overrides `start_states`.
+
+
Returns:
- :obj:`torch.FloatTensor`: The SQuAD 2.0 answer class.
+ `torch.FloatTensor`: The SQuAD 2.0 answer class.
"""
# No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
hsz = hidden_states.shape[-1]
@@ -1937,23 +1943,23 @@ class PoolerAnswerClass(nn.Module):
@dataclass
class SquadHeadOutput(ModelOutput):
"""
- Base class for outputs of question answering models using a :class:`~transformers.modeling_utils.SQuADHead`.
+ Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].
Args:
- loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
- start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
+ start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
- start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
+ start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
- end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
- Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
+ end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+ Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
(beam-search).
- end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
- Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
- cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
- Log probabilities for the ``is_impossible`` label of the answers.
+ end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+ Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
+ cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+ Log probabilities for the `is_impossible` label of the answers.
"""
@@ -1970,9 +1976,9 @@ class SQuADHead(nn.Module):
A SQuAD head inspired by XLNet.
Args:
- config (:class:`~transformers.PretrainedConfig`):
- The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the
- :obj:`layer_norm_eps` to use.
+ config ([`PretrainedConfig`]):
+ The config used by the model, will be used to grab the `hidden_size` of the model and the
+ `layer_norm_eps` to use.
"""
def __init__(self, config):
@@ -1997,21 +2003,21 @@ class SQuADHead(nn.Module):
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
"""
Args:
- hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Positions of the first token for the labeled span.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Positions of the last token for the labeled span.
- cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
- Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
- is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+ cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+ is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Whether the question has a possible answer in the paragraph or not.
- p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+ p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
should be masked.
- return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+ return_dict (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
Returns:
"""
@@ -2087,26 +2093,26 @@ class SequenceSummary(nn.Module):
Compute a single vector summary of a sequence hidden states.
Args:
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
+ - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
- - :obj:`"last"` -- Take the last token hidden state (like XLNet)
- - :obj:`"first"` -- Take the first token hidden state (like Bert)
- - :obj:`"mean"` -- Take the mean of all tokens hidden states
- - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
- - :obj:`"attn"` -- Not implemented now, use multi-head attention
+ - `"last"` -- Take the last token hidden state (like XLNet)
+ - `"first"` -- Take the first token hidden state (like Bert)
+ - `"mean"` -- Take the mean of all tokens hidden states
+ - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+ - `"attn"` -- Not implemented now, use multi-head attention
- - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
- - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
- :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
- - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
- output, another string or :obj:`None` will add no activation.
- - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
+ - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+ - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to
+ `config.num_labels` classes (otherwise to `config.hidden_size`).
+ - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the
+ output, another string or `None` will add no activation.
+ - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and
activation.
- - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and
+ - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and
activation.
"""
@@ -2146,14 +2152,14 @@ class SequenceSummary(nn.Module):
Compute a single vector summary of a sequence hidden states.
Args:
- hidden_states (:obj:`torch.FloatTensor` of shape :obj:`[batch_size, seq_len, hidden_size]`):
+ hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
The hidden states of the last layer.
- cls_index (:obj:`torch.LongTensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`):
- Used if :obj:`summary_type == "cls_index"` and takes the last token of the sequence as classification
+ cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+ Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification
token.
Returns:
- :obj:`torch.FloatTensor`: The summary of the sequence hidden states.
+ `torch.FloatTensor`: The summary of the sequence hidden states.
"""
if self.summary_type == "last":
output = hidden_states[:, -1]
@@ -2189,7 +2195,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
Recursively unwraps a model from potential containers (as used in distributed training).
Args:
- model (:obj:`torch.nn.Module`): The model to unwrap.
+ model (`torch.nn.Module`): The model to unwrap.
"""
# since there could be multiple levels of wrapping, unwrap recursively
if hasattr(model, "module"):
@@ -2205,12 +2211,12 @@ def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0)
Used to remove heads.
Args:
- layer (:obj:`torch.nn.Linear`): The layer to prune.
- index (:obj:`torch.LongTensor`): The indices to keep in the layer.
- dim (:obj:`int`, `optional`, defaults to 0): The dimension on which to keep the indices.
+ layer (`torch.nn.Linear`): The layer to prune.
+ index (`torch.LongTensor`): The indices to keep in the layer.
+ dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices.
Returns:
- :obj:`torch.nn.Linear`: The pruned layer as a new layer with :obj:`requires_grad=True`.
+ `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`.
"""
index = index.to(layer.weight.device)
W = layer.weight.index_select(dim, index).clone().detach()
@@ -2240,12 +2246,12 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) ->
Used to remove heads.
Args:
- layer (:class:`~transformers.modeling_utils.Conv1D`): The layer to prune.
- index (:obj:`torch.LongTensor`): The indices to keep in the layer.
- dim (:obj:`int`, `optional`, defaults to 1): The dimension on which to keep the indices.
+ layer ([`~modeling_utils.Conv1D`]): The layer to prune.
+ index (`torch.LongTensor`): The indices to keep in the layer.
+ dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices.
Returns:
- :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with :obj:`requires_grad=True`.
+ [`~modeling_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`.
"""
index = index.to(layer.weight.device)
W = layer.weight.index_select(dim, index).clone().detach()
@@ -2274,13 +2280,13 @@ def prune_layer(
Used to remove heads.
Args:
- layer (:obj:`Union[torch.nn.Linear, Conv1D]`): The layer to prune.
- index (:obj:`torch.LongTensor`): The indices to keep in the layer.
- dim (:obj:`int`, `optional`): The dimension on which to keep the indices.
+ layer (`Union[torch.nn.Linear, Conv1D]`): The layer to prune.
+ index (`torch.LongTensor`): The indices to keep in the layer.
+ dim (`int`, *optional*): The dimension on which to keep the indices.
Returns:
- :obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with
- :obj:`requires_grad=True`.
+ `torch.nn.Linear` or [`~modeling_utils.Conv1D`]: The pruned layer as a new layer with
+ `requires_grad=True`.
"""
if isinstance(layer, nn.Linear):
return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
@@ -2294,37 +2300,38 @@ def apply_chunking_to_forward(
forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors
) -> torch.Tensor:
"""
- This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the
- dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory.
+ This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the
+ dimension `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory.
- If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as
- directly applying :obj:`forward_fn` to :obj:`input_tensors`.
+ If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as
+ directly applying `forward_fn` to `input_tensors`.
Args:
- forward_fn (:obj:`Callable[..., torch.Tensor]`):
+ forward_fn (`Callable[..., torch.Tensor]`):
The forward function of the model.
- chunk_size (:obj:`int`):
- The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`.
- chunk_dim (:obj:`int`):
- The dimension over which the :obj:`input_tensors` should be chunked.
- input_tensors (:obj:`Tuple[torch.Tensor]`):
- The input tensors of ``forward_fn`` which will be chunked
+ chunk_size (`int`):
+ The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`.
+ chunk_dim (`int`):
+ The dimension over which the `input_tensors` should be chunked.
+ input_tensors (`Tuple[torch.Tensor]`):
+ The input tensors of `forward_fn` which will be chunked
Returns:
- :obj:`torch.Tensor`: A tensor with the same shape as the :obj:`forward_fn` would have given if applied`.
+ `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied.
- Examples::
+ Examples:
- # rename the usual forward() fn to forward_chunk()
- def forward_chunk(self, hidden_states):
- hidden_states = self.decoder(hidden_states)
- return hidden_states
+ ```python
+ # rename the usual forward() fn to forward_chunk()
+ def forward_chunk(self, hidden_states):
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
- # implement a chunked forward function
- def forward(self, hidden_states):
- return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
- """
+ # implement a chunked forward function
+ def forward(self, hidden_states):
+ return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
+ ```"""
assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors"
diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py
index 2bf3171d0d..4f9b6be85e 100644
--- a/src/transformers/models/albert/configuration_albert.py
+++ b/src/transformers/models/albert/configuration_albert.py
@@ -35,79 +35,78 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel` or a
- :class:`~transformers.TFAlbertModel`. It is used to instantiate an ALBERT model according to the specified
+ This is the configuration class to store the configuration of a [`AlbertModel`] or a
+ [`TFAlbertModel`]. It is used to instantiate an ALBERT model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the ALBERT `xxlarge `__ architecture.
+ configuration to that of the ALBERT [xxlarge](https://huggingface.co/albert-xxlarge-v2) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30000):
+ vocab_size (`int`, *optional*, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.AlbertModel` or
- :class:`~transformers.TFAlbertModel`.
- embedding_size (:obj:`int`, `optional`, defaults to 128):
+ `input_ids` passed when calling [`AlbertModel`] or
+ [`TFAlbertModel`].
+ embedding_size (`int`, *optional*, defaults to 128):
Dimensionality of vocabulary embeddings.
- hidden_size (:obj:`int`, `optional`, defaults to 4096):
+ hidden_size (`int`, *optional*, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_hidden_groups (:obj:`int`, `optional`, defaults to 1):
+ num_hidden_groups (`int`, *optional*, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
- num_attention_heads (:obj:`int`, `optional`, defaults to 64):
+ num_attention_heads (`int`, *optional*, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 16384):
+ intermediate_size (`int`, *optional*, defaults to 16384):
The dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- inner_group_num (:obj:`int`, `optional`, defaults to 1):
+ inner_group_num (`int`, *optional*, defaults to 1):
The number of inner repetition of attention and ffn.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
(e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.AlbertModel` or
- :class:`~transformers.TFAlbertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`AlbertModel`] or
+ [`TFAlbertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- classifier_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for attached classifiers.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- `__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- `__.
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
- Examples::
+ Examples:
- >>> from transformers import AlbertConfig, AlbertModel
- >>> # Initializing an ALBERT-xxlarge style configuration
- >>> albert_xxlarge_configuration = AlbertConfig()
+ ```python
+ >>> from transformers import AlbertConfig, AlbertModel
+ >>> # Initializing an ALBERT-xxlarge style configuration
+ >>> albert_xxlarge_configuration = AlbertConfig()
- >>> # Initializing an ALBERT-base style configuration
- >>> albert_base_configuration = AlbertConfig(
- ... hidden_size=768,
- ... num_attention_heads=12,
- ... intermediate_size=3072,
- ... )
+ >>> # Initializing an ALBERT-base style configuration
+ >>> albert_base_configuration = AlbertConfig(
+ ... hidden_size=768,
+ ... num_attention_heads=12,
+ ... intermediate_size=3072,
+ ... )
- >>> # Initializing a model from the ALBERT-base style configuration
- >>> model = AlbertModel(albert_xxlarge_configuration)
+ >>> # Initializing a model from the ALBERT-xxlarge style configuration
+ >>> model = AlbertModel(albert_xxlarge_configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "albert"
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index 7ff4552163..c7ae117a1b 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -742,18 +742,20 @@ class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import AlbertTokenizer, FlaxAlbertForPreTraining
+ ```python
+ >>> from transformers import AlbertTokenizer, FlaxAlbertForPreTraining
- >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
- >>> model = FlaxAlbertForPreTraining.from_pretrained('albert-base-v2')
+ >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+ >>> model = FlaxAlbertForPreTraining.from_pretrained('albert-base-v2')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.sop_logits
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.sop_logits
+ ```
"""
overwrite_call_docstring(
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index 56ced8ec4d..05c755188f 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -885,20 +885,21 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
r"""
Return:
- Example::
+ Example:
- >>> import tensorflow as tf
- >>> from transformers import AlbertTokenizer, TFAlbertForPreTraining
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import AlbertTokenizer, TFAlbertForPreTraining
- >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
- >>> model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
+ >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+ >>> model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
- >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
- >>> outputs = model(input_ids)
+ >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
+ >>> outputs = model(input_ids)
- >>> prediction_logits = outputs.prediction_logits
- >>> sop_logits = outputs.sop_logits
- """
+ >>> prediction_logits = outputs.prediction_logits
+ >>> sop_logits = outputs.sop_logits
+ ```"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py
index 6caa93db57..1d91860d0d 100644
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -58,68 +58,73 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizer(PreTrainedTokenizer):
"""
- Construct an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+ Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ remove_space (`bool`, *optional*, defaults to `True`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
- keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ keep_accents (`bool`, *optional*, defaults to `False`):
Whether or not to keep accents when tokenizing.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -252,17 +257,17 @@ class AlbertTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An ALBERT sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -275,18 +280,18 @@ class AlbertTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -305,21 +310,21 @@ class AlbertTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py
index 9eb91ec555..3c5adfbd21 100644
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -72,44 +72,46 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
- <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
- inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+ Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+ inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods
Args:
- vocab_file (:obj:`str`):
- `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ remove_space (`bool`, *optional*, defaults to `True`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
- keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ keep_accents (`bool`, *optional*, defaults to `False`):
Whether or not to keep accents when tokenizing.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
- that is used for the end of sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ that is used for the end of sequence. The token used is the `sep_token`.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -172,17 +174,17 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An ALBERT sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -197,21 +199,21 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
- ::
-
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index 34124fc272..b9afb0d363 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -28,10 +28,10 @@ logger = logging.get_logger(__name__)
CLASS_DOCSTRING = """
This is a generic model class that will be instantiated as one of the model classes of the library when created
- with the :meth:`~transformers.BaseAutoModelClass.from_pretrained` class method or the
- :meth:`~transformers.BaseAutoModelClass.from_config` class method.
+ with the [`~BaseAutoModelClass.from_pretrained`] class method or the
+ [`~BaseAutoModelClass.from_config`] class method.
- This class cannot be instantiated directly using ``__init__()`` (throws an error).
+ This class cannot be instantiated directly using `__init__()` (throws an error).
"""
FROM_CONFIG_DOCSTRING = """
@@ -39,309 +39,314 @@ FROM_CONFIG_DOCSTRING = """
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the
- model's configuration. Use :meth:`~transformers.BaseAutoModelClass.from_pretrained` to load the model
+ model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model
weights.
Args:
- config (:class:`~transformers.PretrainedConfig`):
+ config ([`PretrainedConfig`]):
The model class to instantiate is selected based on the configuration class:
List options
- Examples::
+ Examples:
- >>> from transformers import AutoConfig, BaseAutoModelClass
- >>> # Download configuration from huggingface.co and cache.
- >>> config = AutoConfig.from_pretrained('checkpoint_placeholder')
- >>> model = BaseAutoModelClass.from_config(config)
+ ```python
+ >>> from transformers import AutoConfig, BaseAutoModelClass
+ >>> # Download configuration from huggingface.co and cache.
+ >>> config = AutoConfig.from_pretrained('checkpoint_placeholder')
+ >>> model = BaseAutoModelClass.from_config(config)
+ ```
"""
FROM_PRETRAINED_TORCH_DOCSTRING = """
Instantiate one of the model classes of the library from a pretrained model.
- The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
- passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
- by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+ The model class to instantiate is selected based on the `model_type` property of the config object (either
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing,
+ by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
- The model is set in evaluation mode by default using ``model.eval()`` (so for instance, dropout modules are
- deactivated). To train the model, you should first set it back in training mode with ``model.train()``
+ The model is set in evaluation mode by default using `model.eval()` (so for instance, dropout modules are
+ deactivated). To train the model, you should first set it back in training mode with `model.train()`
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
- this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+ this case, `from_tf` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- model_args (additional positional arguments, `optional`):
- Will be passed along to the underlying model ``__init__()`` method.
- config (:class:`~transformers.PretrainedConfig`, `optional`):
+ model_args (additional positional arguments, *optional*):
+ Will be passed along to the underlying model `__init__()` method.
+ config ([`PretrainedConfig`], *optional*):
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
by supplying the save directory.
- - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
- configuration JSON file named `config.json` is found in the directory.
- state_dict (`Dict[str, torch.Tensor]`, `optional`):
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+ configuration JSON file named *config.json* is found in the directory.
+ state_dict (`Dict[str, torch.Tensor]`, *optional*):
A state dictionary to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own
weights. In this case though, you should check if using
- :func:`~transformers.PreTrainedModel.save_pretrained` and
- :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ [`~PreTrainedModel.save_pretrained`] and
+ [`~PreTrainedModel.from_pretrained`] is not a simpler option.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ from_tf (`bool`, *optional*, defaults to `False`):
Load the model weights from a TensorFlow checkpoint save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info (`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
- local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try downloading the model).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
- should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+ should only be set to `True` for repositories you trust and in which you have read the code, as it
will execute code present on the Hub on your local machine.
- kwargs (additional keyword arguments, `optional`):
+ kwargs (additional keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
- underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
- initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
- ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
- with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
- attribute will be passed to the underlying model's ``__init__`` function.
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+ `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+ with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+ attribute will be passed to the underlying model's `__init__` function.
- Examples::
+ Examples:
- >>> from transformers import AutoConfig, BaseAutoModelClass
+ ```python
+ >>> from transformers import AutoConfig, BaseAutoModelClass
- >>> # Download model and configuration from huggingface.co and cache.
- >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
- >>> # Update configuration during loading
- >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
- >>> model.config.output_attentions
- True
+ >>> # Update configuration during loading
+ >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+ >>> model.config.output_attentions
+ True
- >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
- >>> config = AutoConfig.from_pretrained('./tf_model/shortcut_placeholder_tf_model_config.json')
- >>> model = BaseAutoModelClass.from_pretrained('./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+ >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+ >>> config = AutoConfig.from_pretrained('./tf_model/shortcut_placeholder_tf_model_config.json')
+ >>> model = BaseAutoModelClass.from_pretrained('./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+ ```
"""
FROM_PRETRAINED_TF_DOCSTRING = """
Instantiate one of the model classes of the library from a pretrained model.
- The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
- passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
- by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+ The model class to instantiate is selected based on the `model_type` property of the config object (either
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing,
+ by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In
- this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the PyTorch model in a
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In
+ this case, `from_pt` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the PyTorch model in a
TensorFlow model using the provided conversion scripts and loading the TensorFlow model
afterwards.
- model_args (additional positional arguments, `optional`):
- Will be passed along to the underlying model ``__init__()`` method.
- config (:class:`~transformers.PretrainedConfig`, `optional`):
+ model_args (additional positional arguments, *optional*):
+ Will be passed along to the underlying model `__init__()` method.
+ config ([`PretrainedConfig`], *optional*):
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
by supplying the save directory.
- - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
- configuration JSON file named `config.json` is found in the directory.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+ configuration JSON file named *config.json* is found in the directory.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ from_pt (`bool`, *optional*, defaults to `False`):
Load the model weights from a PyTorch checkpoint save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
- local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try downloading the model).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ revision(`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
- should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+ should only be set to `True` for repositories you trust and in which you have read the code, as it
will execute code present on the Hub on your local machine.
- kwargs (additional keyword arguments, `optional`):
+ kwargs (additional keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
- underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
- initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
- ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
- with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
- attribute will be passed to the underlying model's ``__init__`` function.
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+ `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+ with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+ attribute will be passed to the underlying model's `__init__` function.
- Examples::
+ Examples:
- >>> from transformers import AutoConfig, BaseAutoModelClass
+ ```python
+ >>> from transformers import AutoConfig, BaseAutoModelClass
- >>> # Download model and configuration from huggingface.co and cache.
- >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
- >>> # Update configuration during loading
- >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
- >>> model.config.output_attentions
- True
+ >>> # Update configuration during loading
+ >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+ >>> model.config.output_attentions
+ True
- >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
- >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
- >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+ >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
+ >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
+ >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+ ```
"""
FROM_PRETRAINED_FLAX_DOCSTRING = """
Instantiate one of the model classes of the library from a pretrained model.
- The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
- passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
- by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+ The model class to instantiate is selected based on the `model_type` property of the config object (either
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing,
+ by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing model weights saved using
- :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In
- this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
- as ``config`` argument. This loading path is slower than converting the PyTorch model in a
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing model weights saved using
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+ - A path or url to a *PyTorch state_dict save file* (e.g., `./pt_model/pytorch_model.bin`). In
+ this case, `from_pt` should be set to `True` and a configuration object should be provided
+ as `config` argument. This loading path is slower than converting the PyTorch model in a
TensorFlow model using the provided conversion scripts and loading the TensorFlow model
afterwards.
- model_args (additional positional arguments, `optional`):
- Will be passed along to the underlying model ``__init__()`` method.
- config (:class:`~transformers.PretrainedConfig`, `optional`):
+ model_args (additional positional arguments, *optional*):
+ Will be passed along to the underlying model `__init__()` method.
+ config ([`PretrainedConfig`], *optional*):
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
by supplying the save directory.
- - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
- configuration JSON file named `config.json` is found in the directory.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+ configuration JSON file named *config.json* is found in the directory.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ from_pt (`bool`, *optional*, defaults to `False`):
Load the model weights from a PyTorch checkpoint save file (see docstring of
- ``pretrained_model_name_or_path`` argument).
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ `pretrained_model_name_or_path` argument).
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
- local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+ local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try downloading the model).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ revision(`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
- should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+ should only be set to `True` for repositories you trust and in which you have read the code, as it
will execute code present on the Hub on your local machine.
- kwargs (additional keyword arguments, `optional`):
+ kwargs (additional keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
- :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
- underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
- initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
- ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
- with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
- attribute will be passed to the underlying model's ``__init__`` function.
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+ `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+ with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+ attribute will be passed to the underlying model's `__init__` function.
- Examples::
+ Examples:
- >>> from transformers import AutoConfig, BaseAutoModelClass
+ ```python
+ >>> from transformers import AutoConfig, BaseAutoModelClass
- >>> # Download model and configuration from huggingface.co and cache.
- >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
- >>> # Update configuration during loading
- >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
- >>> model.config.output_attentions
- True
+ >>> # Update configuration during loading
+ >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+ >>> model.config.output_attentions
+ True
- >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
- >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
- >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+ >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
+ >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
+ >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+ ```
"""
@@ -445,9 +450,9 @@ class _BaseAutoModelClass:
Register a new model for this class.
Args:
- config_class (:class:`~transformers.PretrainedConfig`):
+ config_class ([`PretrainedConfig`]):
The configuration corresponding to the model to register.
- model_class (:class:`~transformers.PreTrainedModel`):
+ model_class ([`PreTrainedModel`]):
The model to register.
"""
if hasattr(model_class, "config_class") and model_class.config_class != config_class:
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 81e0749c51..bfe9772036 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -475,9 +475,9 @@ def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True
class AutoConfig:
r"""
This is a generic configuration class that will be instantiated as one of the configuration classes of the library
- when created with the :meth:`~transformers.AutoConfig.from_pretrained` class method.
+ when created with the [`~AutoConfig.from_pretrained`] class method.
- This class cannot be instantiated directly using ``__init__()`` (throws an error).
+ This class cannot be instantiated directly using `__init__()` (throws an error).
"""
def __init__(self):
@@ -501,81 +501,81 @@ class AutoConfig:
r"""
Instantiate one of the configuration classes of the library from a pretrained model configuration.
- The configuration class to instantiate is selected based on the :obj:`model_type` property of the config object
+ The configuration class to instantiate is selected based on the `model_type` property of the config object
that is loaded, or when it's missing, by falling back to using pattern matching on
- :obj:`pretrained_model_name_or_path`:
+ `pretrained_model_name_or_path`:
List options
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- - A string, the `model id` of a pretrained model configuration hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing a configuration file saved using the
- :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the
- :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``.
- - A path or url to a saved configuration JSON `file`, e.g.,
- ``./my_model_directory/configuration.json``.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - A string, the *model id* of a pretrained model configuration hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing a configuration file saved using the
+ [`~PretrainedConfig.save_pretrained`] method, or the
+ [`~PreTrainedModel.save_pretrained`] method, e.g., `./my_model_directory/`.
+ - A path or url to a saved configuration JSON *file*, e.g.,
+ `./my_model_directory/configuration.json`.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download the model weights and configuration files and override the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ revision(`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`False`, then this function returns just the final configuration object.
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ If `False`, then this function returns just the final configuration object.
- If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs`
+ If `True`, then this function returns a `Tuple(config, unused_kwargs)` where `unused_kwargs`
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e.,
- the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored.
- trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ the part of `kwargs` which has not been used to update `config` and is otherwise ignored.
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
- should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+ should only be set to `True` for repositories you trust and in which you have read the code, as it
will execute code present on the Hub on your local machine.
- kwargs(additional keyword arguments, `optional`):
+ kwargs(additional keyword arguments, *optional*):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
- by the ``return_unused_kwargs`` keyword parameter.
+ by the `return_unused_kwargs` keyword parameter.
- Examples::
+ Examples:
- >>> from transformers import AutoConfig
+ ```python
+ >>> from transformers import AutoConfig
- >>> # Download configuration from huggingface.co and cache.
- >>> config = AutoConfig.from_pretrained('bert-base-uncased')
+ >>> # Download configuration from huggingface.co and cache.
+ >>> config = AutoConfig.from_pretrained('bert-base-uncased')
- >>> # Download configuration from huggingface.co (user-uploaded) and cache.
- >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased')
+ >>> # Download configuration from huggingface.co (user-uploaded) and cache.
+ >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased')
- >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`).
- >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')
+ >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`).
+ >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')
- >>> # Load a specific configuration file.
- >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+ >>> # Load a specific configuration file.
+ >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
- >>> # Change some config attributes when loading a pretrained config.
- >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
- >>> config.output_attentions
- True
- >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
- >>> config.output_attentions
- True
- >>> config.unused_kwargs
- {'foo': False}
- """
+ >>> # Change some config attributes when loading a pretrained config.
+ >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+ >>> config.output_attentions
+ True
+ >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
+ >>> config.output_attentions
+ True
+ >>> unused_kwargs
+ {'foo': False}
+ ```"""
kwargs["_from_auto"] = True
kwargs["name_or_path"] = pretrained_model_name_or_path
trust_remote_code = kwargs.pop("trust_remote_code", False)
@@ -619,8 +619,8 @@ class AutoConfig:
Register a new configuration for this class.
Args:
- model_type (:obj:`str`): The model type like "bert" or "gpt".
- config (:class:`~transformers.PretrainedConfig`): The config to register.
+ model_type (`str`): The model type like "bert" or "gpt".
+ config ([`PretrainedConfig`]): The config to register.
"""
if issubclass(config, PretrainedConfig) and config.model_type != model_type:
raise ValueError(
diff --git a/src/transformers/models/auto/dynamic.py b/src/transformers/models/auto/dynamic.py
index daf8161a05..1185298d85 100644
--- a/src/transformers/models/auto/dynamic.py
+++ b/src/transformers/models/auto/dynamic.py
@@ -120,60 +120,63 @@ def get_class_from_dynamic_module(
"""
Extracts a class from a module file, present in the local folder or repository of a model.
- .. warning::
+ <Tip warning={true}>
- Calling this function will execute the code in the module file found locally or downloaded from the Hub. It
- should therefore only be called on trusted repos.
+ Calling this function will execute the code in the module file found locally or downloaded from the Hub. It
+ should therefore only be called on trusted repos.
+
+ </Tip>
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a configuration file saved using the
- :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., ``./my_model_directory/``.
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a configuration file saved using the
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
- module_file (:obj:`str`):
+ module_file (`str`):
The name of the module file containing the class to look for.
- class_name (:obj:`str`):
+ class_name (`str`):
The name of the class to import in the module.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision(`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`True`, will only try to load the tokenizer configuration from local files.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ If `True`, will only try to load the tokenizer configuration from local files.
- .. note::
+ <Tip>
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
+ </Tip>
Returns:
- :obj:`type`: The class, dynamically imported from the module.
+ `type`: The class, dynamically imported from the module.
- Examples::
+ Examples:
- # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
- # module.
- cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
- """
+ ```python
+ # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
+ # module.
+ cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
+ ```"""
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 45f12953f9..d5b2213ec4 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -65,9 +65,9 @@ def feature_extractor_class_from_name(class_name: str):
class AutoFeatureExtractor:
r"""
This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the
- library when created with the :meth:`AutoFeatureExtractor.from_pretrained` class method.
+ library when created with the [`AutoFeatureExtractor.from_pretrained`] class method.
- This class cannot be instantiated directly using ``__init__()`` (throws an error).
+ This class cannot be instantiated directly using `__init__()` (throws an error).
"""
def __init__(self):
@@ -82,68 +82,69 @@ class AutoFeatureExtractor:
r"""
Instantiate one of the feature extractor classes of the library from a pretrained model vocabulary.
- The feature extractor class to instantiate is selected based on the :obj:`model_type` property of the config
- object (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when
- it's missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+ The feature extractor class to instantiate is selected based on the `model_type` property of the config
+ object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when
+ it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Params:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a feature extractor file saved using the
- :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` method, e.g.,
- ``./my_model_directory/``.
- - a path or url to a saved feature extractor JSON `file`, e.g.,
- ``./my_model_directory/preprocessor_config.json``.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a feature extractor file saved using the
+ [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
+ `./my_model_directory/`.
+ - a path or url to a saved feature extractor JSON *file*, e.g.,
+ `./my_model_directory/preprocessor_config.json`.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the feature extractor files and override the cached versions
if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file
exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
- then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ If `False`, then this function returns just the final feature extractor object. If `True`,
+ then this function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a
dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
- part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
- kwargs (:obj:`Dict[str, Any]`, `optional`):
+ part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+ kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
- controlled by the ``return_unused_kwargs`` keyword parameter.
+ controlled by the `return_unused_kwargs` keyword parameter.
- .. note::
+
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
- Examples::
+
- >>> from transformers import AutoFeatureExtractor
+ Examples:
- >>> # Download feature extractor from huggingface.co and cache.
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')
+ ```python
+ >>> from transformers import AutoFeatureExtractor
- >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using `save_pretrained('./test/saved_model/')`)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/')
+ >>> # Download feature extractor from huggingface.co and cache.
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')
- """
+ >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using `save_pretrained('./test/saved_model/')`)
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/')
+ ```"""
config = kwargs.pop("config", None)
kwargs["_from_auto"] = True
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 5a5cf8ac8a..f9f4868369 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -62,9 +62,9 @@ def processor_class_from_name(class_name: str):
class AutoProcessor:
r"""
This is a generic processor class that will be instantiated as one of the processor classes of the library when
- created with the :meth:`AutoProcessor.from_pretrained` class method.
+ created with the [`AutoProcessor.from_pretrained`] class method.
- This class cannot be instantiated directly using ``__init__()`` (throws an error).
+ This class cannot be instantiated directly using `__init__()` (throws an error).
"""
def __init__(self):
@@ -79,64 +79,65 @@ class AutoProcessor:
r"""
Instantiate one of the processor classes of the library from a pretrained model vocabulary.
- The processor class to instantiate is selected based on the :obj:`model_type` property of the config object
- (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible):
+ The processor class to instantiate is selected based on the `model_type` property of the config object
+ (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible):
List options
Params:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a processor files saved using the :obj:`save_pretrained()` method,
- e.g., ``./my_model_directory/``.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing processor files saved using the `save_pretrained()` method,
+ e.g., `./my_model_directory/`.
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the feature extractor files and override the cached versions
if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file
exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
- then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ If `False`, then this function returns just the final feature extractor object. If `True`,
+ then this function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a
dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
- part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
- kwargs (:obj:`Dict[str, Any]`, `optional`):
+ part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+ kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
- controlled by the ``return_unused_kwargs`` keyword parameter.
+ controlled by the `return_unused_kwargs` keyword parameter.
- .. note::
+
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
- Examples::
+
- >>> from transformers import AutoProcessor
+ Examples:
- >>> # Download processor from huggingface.co and cache.
- >>> processor = AutoProcessor.from_pretrained('facebook/wav2vec2-base-960h')
+ ```python
+ >>> from transformers import AutoProcessor
- >>> # If processor files are in a directory (e.g. processor was saved using `save_pretrained('./test/saved_model/')`)
- >>> processor = AutoProcessor.from_pretrained('./test/saved_model/')
+ >>> # Download processor from huggingface.co and cache.
+ >>> processor = AutoProcessor.from_pretrained('facebook/wav2vec2-base-960h')
- """
+ >>> # If processor files are in a directory (e.g. processor was saved using `save_pretrained('./test/saved_model/')`)
+ >>> processor = AutoProcessor.from_pretrained('./test/saved_model/')
+ ```"""
config = kwargs.pop("config", None)
kwargs["_from_auto"] = True
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index ca7103f238..6ec092f368 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -273,58 +273,59 @@ def get_tokenizer_config(
Loads the tokenizer configuration from a pretrained model tokenizer configuration.
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a configuration file saved using the
- :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., ``./my_model_directory/``.
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a configuration file saved using the
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- use_auth_token (:obj:`str` or `bool`, `optional`):
- The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
- generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+ generated when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
- If :obj:`True`, will only try to load the tokenizer configuration from local files.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ If `True`, will only try to load the tokenizer configuration from local files.
- .. note::
+
- Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+ Passing `use_auth_token=True` is required when you want to use a private model.
+
Returns:
- :obj:`Dict`: The configuration of the tokenizer.
+ `Dict`: The configuration of the tokenizer.
- Examples::
+ Examples:
- # Download configuration from huggingface.co and cache.
- tokenizer_config = get_tokenizer_config("bert-base-uncased")
- # This model does not have a tokenizer config so the result will be an empty dict.
- tokenizer_config = get_tokenizer_config("xlm-roberta-base")
+ ```python
+ # Download configuration from huggingface.co and cache.
+ tokenizer_config = get_tokenizer_config("bert-base-uncased")
+ # This model does not have a tokenizer config so the result will be an empty dict.
+ tokenizer_config = get_tokenizer_config("xlm-roberta-base")
- # Save a pretrained tokenizer locally and you can reload its config
- from transformers import AutoTokenizer
+ # Save a pretrained tokenizer locally and you can reload its config
+ from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
- tokenizer.save_pretrained("tokenizer-test")
- tokenizer_config = get_tokenizer_config("tokenizer-test")
- """
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+ tokenizer.save_pretrained("tokenizer-test")
+ tokenizer_config = get_tokenizer_config("tokenizer-test")
+ ```"""
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
@@ -360,9 +361,9 @@ def get_tokenizer_config(
class AutoTokenizer:
r"""
This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
- created with the :meth:`AutoTokenizer.from_pretrained` class method.
+ created with the [`AutoTokenizer.from_pretrained`] class method.
- This class cannot be instantiated directly using ``__init__()`` (throws an error).
+ This class cannot be instantiated directly using `__init__()` (throws an error).
"""
def __init__(self):
@@ -377,75 +378,74 @@ class AutoTokenizer:
r"""
Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.
- The tokenizer class to instantiate is selected based on the :obj:`model_type` property of the config object
- (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's
- missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+ The tokenizer class to instantiate is selected based on the `model_type` property of the config object
+ (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
+ missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Params:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
- Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
- a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
- using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
- ``./my_model_directory/``.
+ - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+ a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+ using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g.,
+ `./my_model_directory/`.
- A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
- single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
+ single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
applicable to all derived classes)
- inputs (additional positional arguments, `optional`):
- Will be passed along to the Tokenizer ``__init__()`` method.
- config (:class:`~transformers.PretrainedConfig`, `optional`)
+ inputs (additional positional arguments, *optional*):
+ Will be passed along to the Tokenizer `__init__()` method.
+ config ([`PretrainedConfig`], *optional*):
The configuration object used to dertermine the tokenizer class to instantiate.
- cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+ cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download the model weights and configuration files and override the
cached versions if they exist.
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies (:obj:`Dict[str, str]`, `optional`):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
- revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
- subfolder (:obj:`str`, `optional`):
+ subfolder (`str`, *optional*):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
facebook/rag-token-base), specify it here.
- use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_fast (`bool`, *optional*, defaults to `True`):
Whether or not to try to load the fast version of the tokenizer.
- tokenizer_type (:obj:`str`, `optional`):
+ tokenizer_type (`str`, *optional*):
Tokenizer type to be loaded.
- trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
- should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+ should only be set to `True` for repositories you trust and in which you have read the code, as it
will execute code present on the Hub on your local machine.
- kwargs (additional keyword arguments, `optional`):
- Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like
- ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
- ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__()`` for more details.
+ kwargs (additional keyword arguments, *optional*):
+ Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
+ `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`,
+ `mask_token`, `additional_special_tokens`. See parameters in the `__init__()` for more details.
- Examples::
+ Examples:
- >>> from transformers import AutoTokenizer
+ ```python
+ >>> from transformers import AutoTokenizer
- >>> # Download vocabulary from huggingface.co and cache.
- >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+ >>> # Download vocabulary from huggingface.co and cache.
+ >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
- >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
- >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+ >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
+ >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
- >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
- >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
-
- """
+ >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+ >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
+ ```"""
config = kwargs.pop("config", None)
kwargs["_from_auto"] = True
@@ -568,11 +568,11 @@ class AutoTokenizer:
Args:
- config_class (:class:`~transformers.PretrainedConfig`):
+ config_class ([`PretrainedConfig`]):
The configuration corresponding to the model to register.
- slow_tokenizer_class (:class:`~transformers.PretrainedTokenizer`, `optional`):
+ slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
The slow tokenizer to register.
- slow_tokenizer_class (:class:`~transformers.PretrainedTokenizerFast`, `optional`):
+ fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
The fast tokenizer to register.
"""
if slow_tokenizer_class is None and fast_tokenizer_class is None:
diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py
index 86ca38a61d..3e978bba50 100644
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@@ -32,79 +32,79 @@ BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BartConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to
+ This is the configuration class to store the configuration of a [`BartModel`]. It is used to
instantiate a BART model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large
- `__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the BART [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel` or
- :class:`~transformers.TFBartModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `inputs_ids` passed when calling [`BartModel`] or
+ [`TFBartModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- num_labels: (:obj:`int`, `optional`, defaults to 3):
- The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
- forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
- The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
- :obj:`eos_token_id`.
+ num_labels (`int`, *optional*, defaults to 3):
+ The number of labels to use in [`BartForSequenceClassification`].
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
- Example::
+ Example:
- >>> from transformers import BartModel, BartConfig
+ ```python
+ >>> from transformers import BartModel, BartConfig
- >>> # Initializing a BART facebook/bart-large style configuration
- >>> configuration = BartConfig()
+ >>> # Initializing a BART facebook/bart-large style configuration
+ >>> configuration = BartConfig()
- >>> # Initializing a model from the facebook/bart-large style configuration
- >>> model = BartModel(configuration)
+ >>> # Initializing a model from the facebook/bart-large style configuration
+ >>> model = BartModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "bart"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py
index b001b5bf58..67a5872e65 100644
--- a/src/transformers/models/bart/modeling_flax_bart.py
+++ b/src/transformers/models/bart/modeling_flax_bart.py
@@ -1016,17 +1016,18 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+ ```python
+ >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
- >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
- >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+ >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+ >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
- """
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1081,23 +1082,24 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+ ```python
+ >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
- >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
- >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+ >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+ >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> last_decoder_hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1348,23 +1350,24 @@ class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+ ```python
+ >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
- >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
- >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+ >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+ >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py
index 5a6b960dbb..ccd189ab1b 100644
--- a/src/transformers/models/bart/tokenization_bart.py
+++ b/src/transformers/models/bart/tokenization_bart.py
@@ -56,8 +56,8 @@ class BartTokenizer(RobertaTokenizer):
r"""
Construct a BART tokenizer.
- :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
- :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
+ [`BartTokenizer`] is identical to [`RobertaTokenizer`]. Refer to superclass
+ [`RobertaTokenizer`] for usage examples and documentation concerning the initialization
parameters and other methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py
index 10ba84e7ab..33bda3efb1 100644
--- a/src/transformers/models/bart/tokenization_bart_fast.py
+++ b/src/transformers/models/bart/tokenization_bart_fast.py
@@ -63,10 +63,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class BartTokenizerFast(RobertaTokenizerFast):
r"""
- Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
- superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
+ [`BartTokenizerFast`] is identical to [`RobertaTokenizerFast`]. Refer to
+ superclass [`RobertaTokenizerFast`] for usage examples and documentation concerning the
initialization parameters and other methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py
index 2f39f421bd..ea0ae5897b 100644
--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -48,65 +48,70 @@ SPIECE_UNDERLINE = "▁"
class BarthezTokenizer(PreTrainedTokenizer):
"""
- Adapted from :class:`~transformers.CamembertTokenizer` and :class:`~transformers.BartTokenizer`. Construct a
- BARThez tokenizer. Based on `SentencePiece `__.
+ Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a
+ BARThez tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -160,17 +165,17 @@ class BarthezTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BARThez sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -184,18 +189,18 @@ class BarthezTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -213,13 +218,13 @@ class BarthezTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py
index a66f5936a9..f896a331a0 100644
--- a/src/transformers/models/barthez/tokenization_barthez_fast.py
+++ b/src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -58,46 +58,52 @@ SPIECE_UNDERLINE = "▁"
class BarthezTokenizerFast(PreTrainedTokenizerFast):
"""
- Adapted from :class:`~transformers.CamembertTokenizer` and :class:`~transformers.BartTokenizer`. Construct a "fast"
- BARThez tokenizer. Based on `SentencePiece `__.
+ Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast"
+ BARThez tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
"""
@@ -146,17 +152,17 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BARThez sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -172,13 +178,13 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py
index adb6af893f..c2e599371c 100644
--- a/src/transformers/models/bartpho/tokenization_bartpho.py
+++ b/src/transformers/models/bartpho/tokenization_bartpho.py
@@ -45,68 +45,72 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
class BartphoTokenizer(PreTrainedTokenizer):
"""
- Adapted from :class:`~transformers.XLMRobertaTokenizer`. Based on `SentencePiece
- `__.
+ Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the
multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types.
- monolingual_vocab_file (:obj:`str`):
+ monolingual_vocab_file (`str`):
Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized
types extracted from the multilingual vocabulary vocab_file of 250K types.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -183,17 +187,17 @@ class BartphoTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An BARTPho sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -207,18 +211,18 @@ class BartphoTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -238,13 +242,13 @@ class BartphoTokenizer(PreTrainedTokenizer):
make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py
index 15a0b82b7e..6634fc03b1 100644
--- a/src/transformers/models/beit/configuration_beit.py
+++ b/src/transformers/models/beit/configuration_beit.py
@@ -28,86 +28,87 @@ BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BeitConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BeitModel`. It is used to
+ This is the configuration class to store the configuration of a [`BeitModel`]. It is used to
instantiate an BEiT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BEiT
- `microsoft/beit-base-patch16-224-in22k <https://huggingface.co/microsoft/beit-base-patch16-224-in22k>`__
+ [microsoft/beit-base-patch16-224-in22k](https://huggingface.co/microsoft/beit-base-patch16-224-in22k)
architecture.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 8092):
+ vocab_size (`int`, *optional*, defaults to 8092):
Vocabulary size of the BEiT model. Defines the number of different image tokens that can be used during
pre-training.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- image_size (:obj:`int`, `optional`, defaults to :obj:`224`):
+ image_size (`int`, *optional*, defaults to `224`):
The size (resolution) of each image.
- patch_size (:obj:`int`, `optional`, defaults to :obj:`16`):
+ patch_size (`int`, *optional*, defaults to `16`):
The size (resolution) of each patch.
- num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
+ num_channels (`int`, *optional*, defaults to `3`):
The number of input channels.
- use_mask_token (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_mask_token (`bool`, *optional*, defaults to `False`):
Whether to use a mask token for masked image modeling.
- use_absolute_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`):
Whether to use BERT-style absolute position embeddings.
- use_relative_position_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_relative_position_bias (`bool`, *optional*, defaults to `False`):
Whether to use T5-style relative position embeddings in the self-attention layers.
- use_shared_relative_position_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`):
Whether to use the same relative position embeddings across all self-attention layers of the Transformer.
- layer_scale_init_value (:obj:`float`, `optional`, defaults to 0.1):
+ layer_scale_init_value (`float`, *optional*, defaults to 0.1):
Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
- drop_path_rate (:obj:`float`, `optional`, defaults to 0.1):
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate per sample (when applied in the main path of residual layers).
- use_mean_pooling (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_mean_pooling (`bool`, *optional*, defaults to `True`):
Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
CLS token, before applying the classification head.
- out_indices (:obj:`List[int]`, `optional`, defaults to :obj:`[3, 5, 7, 11]`):
+ out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
Indices of the feature maps to use for semantic segmentation.
- pool_scales (:obj:`Tuple[int]`, `optional`, defaults to :obj:`[1, 2, 3, 6]`):
+ pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
Pooling scales used in Pooling Pyramid Module applied on the last feature map.
- use_auxiliary_head (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_auxiliary_head (`bool`, *optional*, defaults to `True`):
Whether to use an auxiliary head during training.
- auxiliary_loss_weight (:obj:`float`, `optional`, defaults to 0.4):
+ auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
Weight of the cross-entropy loss of the auxiliary head.
- auxiliary_channels (:obj:`int`, `optional`, defaults to 256):
+ auxiliary_channels (`int`, *optional*, defaults to 256):
Number of channels to use in the auxiliary head.
- auxiliary_num_convs (:obj:`int`, `optional`, defaults to 1):
+ auxiliary_num_convs (`int`, *optional*, defaults to 1):
Number of convolutional layers to use in the auxiliary head.
- auxiliary_concat_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
Whether to concatenate the output of the auxiliary head with the input before the classification layer.
- semantic_loss_ignore_index (:obj:`int`, `optional`, defaults to 255):
+ semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
The index that is ignored by the loss function of the semantic segmentation model.
- Example::
+ Example:
- >>> from transformers import BeitModel, BeitConfig
+ ```python
+ >>> from transformers import BeitModel, BeitConfig
- >>> # Initializing a BEiT beit-base-patch16-224-in22k style configuration
- >>> configuration = BeitConfig()
+ >>> # Initializing a BEiT beit-base-patch16-224-in22k style configuration
+ >>> configuration = BeitConfig()
- >>> # Initializing a model from the beit-base-patch16-224-in22k style configuration
- >>> model = BeitModel(configuration)
+ >>> # Initializing a model from the beit-base-patch16-224-in22k style configuration
+ >>> model = BeitModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "beit"
def __init__(
diff --git a/src/transformers/models/beit/feature_extraction_beit.py b/src/transformers/models/beit/feature_extraction_beit.py
index 66067b34ee..997f860115 100644
--- a/src/transformers/models/beit/feature_extraction_beit.py
+++ b/src/transformers/models/beit/feature_extraction_beit.py
@@ -38,34 +38,34 @@ class BeitFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a BEiT feature extractor.
- This feature extractor inherits from :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` which
+ This feature extractor inherits from [`~feature_extraction_utils.FeatureExtractionMixin`] which
contains most of the main methods. Users should refer to this superclass for more information regarding those
methods.
Args:
- do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to resize the input to a certain :obj:`size`.
- size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 256):
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the input to a certain `size`.
+ size (`int` or `Tuple(int)`, *optional*, defaults to 256):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
- integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
- is set to :obj:`True`.
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
- An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
- :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
- Only has an effect if :obj:`do_resize` is set to :obj:`True`.
- do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+ integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+ is set to `True`.
+ resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+ An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+ `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+ Only has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
the image is padded with 0's and then center cropped.
- crop_size (:obj:`int`, `optional`, defaults to 224):
- Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
- :obj:`True`.
- do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
- image_mean (:obj:`List[int]`, defaults to :obj:`[0.5, 0.5, 0.5]`):
+ crop_size (`int`, *optional*, defaults to 224):
+ Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+ `True`.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether or not to normalize the input with `image_mean` and `image_std`.
+ image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of means for each channel, to be used when normalizing images.
- image_std (:obj:`List[int]`, defaults to :obj:`[0.5, 0.5, 0.5]`):
+ image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
- reduce_labels (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
background label will be replaced by 255.
@@ -107,34 +107,36 @@ class BeitFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Main method to prepare for the model one or several image(s).
- .. warning::
+ <Tip warning={true}>
- NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
- PIL images.
+ NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+ PIL images.
+
+ </Tip>
Args:
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- segmentation_maps (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`, `optional`):
+ segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
Optionally, the corresponding semantic segmentation maps with the pixel-wise annotations.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
- * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- - **labels** -- Optional labels to be fed to a model (when :obj:`segmentation_maps` are provided)
+ - **labels** -- Optional labels to be fed to a model (when `segmentation_maps` are provided)
"""
# Input type checking for clearer error
valid_images = False
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index bd8071ba70..d9a8f47c2f 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -626,22 +626,23 @@ class BeitModel(BeitPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import BeitFeatureExtractor, BeitModel
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, BeitModel
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
- >>> model = BeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+ >>> model = BeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py
index 5a1b0c25af..b81f1afb41 100644
--- a/src/transformers/models/beit/modeling_flax_beit.py
+++ b/src/transformers/models/beit/modeling_flax_beit.py
@@ -728,21 +728,23 @@ class FlaxBeitModel(FlaxBeitPreTrainedModel):
FLAX_BEIT_MODEL_DOCSTRING = """
Returns:
- Examples::
+ Examples:
- >>> from transformers import BeitFeatureExtractor, FlaxBeitModel
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, FlaxBeitModel
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
- >>> model = FlaxBeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+ >>> model = FlaxBeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
- >>> inputs = feature_extractor(images=image, return_tensors="np")
- >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
+ >>> inputs = feature_extractor(images=image, return_tensors="np")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```
"""
overwrite_call_docstring(FlaxBeitModel, FLAX_BEIT_MODEL_DOCSTRING)
@@ -897,24 +899,26 @@ class FlaxBeitForImageClassification(FlaxBeitPreTrainedModel):
FLAX_BEIT_CLASSIF_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import BeitFeatureExtractor, FlaxBeitForImageClassification
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import BeitFeatureExtractor, FlaxBeitForImageClassification
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
- >>> model = FlaxBeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
+ >>> model = FlaxBeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
- >>> inputs = feature_extractor(images=image, return_tensors="np")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ >>> inputs = feature_extractor(images=image, return_tensors="np")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```
"""
overwrite_call_docstring(FlaxBeitForImageClassification, FLAX_BEIT_CLASSIF_DOCSTRING)
diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py
index 861cdfbc8e..a3a3ef5ac8 100644
--- a/src/transformers/models/bert/configuration_bert.py
+++ b/src/transformers/models/bert/configuration_bert.py
@@ -53,71 +53,70 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a
- :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments,
+ This is the configuration class to store the configuration of a [`BertModel`] or a
+ [`TFBertModel`]. It is used to instantiate a BERT model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
- to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+ to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
- :class:`~transformers.TFBertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `input_ids` passed when calling [`BertModel`] or
+ [`TFBertModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
- :class:`~transformers.TFBertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or
+ [`TFBertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- <https://arxiv.org/abs/2009.13658>`__.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if ``config.is_decoder=True``.
- classifier_dropout (:obj:`float`, `optional`):
+ relevant if `config.is_decoder=True`.
+ classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
- Examples::
+ Examples:
- >>> from transformers import BertModel, BertConfig
+ ```python
+ >>> from transformers import BertModel, BertConfig
- >>> # Initializing a BERT bert-base-uncased style configuration
- >>> configuration = BertConfig()
+ >>> # Initializing a BERT bert-base-uncased style configuration
+ >>> configuration = BertConfig()
- >>> # Initializing a model from the bert-base-uncased style configuration
- >>> model = BertModel(configuration)
+ >>> # Initializing a model from the bert-base-uncased style configuration
+ >>> model = BertModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "bert"
def __init__(
diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index 6ffcc9d221..369195df7d 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -832,18 +832,20 @@ class FlaxBertForPreTraining(FlaxBertPreTrainedModel):
FLAX_BERT_FOR_PRETRAINING_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, FlaxBertForPreTraining
+ ```python
+ >>> from transformers import BertTokenizer, FlaxBertForPreTraining
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = FlaxBertForPreTraining.from_pretrained('bert-base-uncased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = FlaxBertForPreTraining.from_pretrained('bert-base-uncased')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```
"""
overwrite_call_docstring(
@@ -976,20 +978,22 @@ class FlaxBertForNextSentencePrediction(FlaxBertPreTrainedModel):
FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import BertTokenizer, FlaxBertForNextSentencePrediction
+ ```python
+ >>> from transformers import BertTokenizer, FlaxBertForNextSentencePrediction
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = FlaxBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = FlaxBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='jax')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='jax')
- >>> outputs = model(**encoding)
- >>> logits = outputs.logits
- >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ >>> outputs = model(**encoding)
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```
"""
diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index 6c3ceaa5bf..3b3854be71 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -1599,21 +1599,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi
r"""
Return:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import BertTokenizer, TFBertForNextSentencePrediction
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import BertTokenizer, TFBertForNextSentencePrediction
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
- >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
- >>> assert logits[0][0] < logits[0][1] # the next sentence was random
- """
+ >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+ >>> assert logits[0][0] < logits[0][1] # the next sentence was random
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py
index 897fb32761..5520f845cc 100644
--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -118,42 +118,41 @@ class BertTokenizer(PreTrainedTokenizer):
r"""
Construct a BERT tokenizer. Based on WordPiece.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
- never_split (:obj:`Iterable`, `optional`):
+ never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
- :obj:`do_basic_tokenize=True`
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ `do_basic_tokenize=True`
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
- This should likely be deactivated for Japanese (see this `issue
- <https://github.com/huggingface/transformers/issues/328>`__).
- strip_accents: (:obj:`bool`, `optional`):
+ This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents: (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
+ value for `lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -252,17 +251,17 @@ class BertTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -275,18 +274,18 @@ class BertTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -305,21 +304,21 @@ class BertTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
@@ -354,19 +353,18 @@ class BasicTokenizer(object):
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- never_split (:obj:`Iterable`, `optional`):
+ never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
- :obj:`do_basic_tokenize=True`
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ `do_basic_tokenize=True`
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
- This should likely be deactivated for Japanese (see this `issue
- <https://github.com/huggingface/transformers/issues/328>`__).
- strip_accents: (:obj:`bool`, `optional`):
+ This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents: (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
+ value for `lowercase` (as in the original BERT).
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -383,9 +381,9 @@ class BasicTokenizer(object):
WordPieceTokenizer.
Args:
- **never_split**: (`optional`) list of str
+ never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
- :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+ [`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -511,14 +509,14 @@ class WordpieceTokenizer(object):
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
- For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+ For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
- text: A single token or whitespace separated tokens. This should have
- already been passed through `BasicTokenizer`.
+ text: A single token or whitespace separated tokens. This should have
+ already been passed through *BasicTokenizer*.
Returns:
- A list of wordpiece tokens.
+ A list of wordpiece tokens.
"""
output_tokens = []
diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py
index 8004978f60..5b0ebaf086 100644
--- a/src/transformers/models/bert/tokenization_bert_fast.py
+++ b/src/transformers/models/bert/tokenization_bert_fast.py
@@ -116,41 +116,41 @@ PRETRAINED_INIT_CONFIGURATION = {
class BertTokenizerFast(PreTrainedTokenizerFast):
r"""
- Construct a "fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+ Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ clean_text (`bool`, *optional*, defaults to `True`):
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
- issue <https://github.com/huggingface/transformers/issues/328>`__).
- strip_accents: (:obj:`bool`, `optional`):
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+ issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents: (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
- wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`):
+ value for `lowercase` (as in the original BERT).
+ wordpieces_prefix: (`str`, *optional*, defaults to `"##"`):
The prefix for subwords.
"""
@@ -205,17 +205,17 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -231,21 +231,21 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py
index 2284f873e7..3c79f25fd2 100644
--- a/src/transformers/models/bert_generation/configuration_bert_generation.py
+++ b/src/transformers/models/bert_generation/configuration_bert_generation.py
@@ -20,62 +20,61 @@ from ...configuration_utils import PretrainedConfig
class BertGenerationConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a
- :class:`~transformers.BertGenerationPreTrainedModel`. It is used to instantiate a BertGeneration model according to
+ [`BertGenerationPreTrainedModel`]. It is used to instantiate a BertGeneration model according to
the specified arguments, defining the model architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50358):
+ vocab_size (`int`, *optional*, defaults to 50358):
Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.BertGeneration`.
- hidden_size (:obj:`int`, `optional`, defaults to 1024):
+ `input_ids` passed when calling [`BertGeneration`].
+ hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+ num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- <https://arxiv.org/abs/2009.13658>`__.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if ``config.is_decoder=True``.
+ relevant if `config.is_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import BertGenerationConfig, BertGenerationEncoder
+ ```python
+ >>> from transformers import BertGenerationConfig, BertGenerationEncoder
- >>> # Initializing a BertGeneration config
- >>> configuration = BertGenerationConfig()
+ >>> # Initializing a BertGeneration config
+ >>> configuration = BertGenerationConfig()
- >>> # Initializing a model from the config
- >>> model = BertGenerationEncoder(configuration)
+ >>> # Initializing a model from the config
+ >>> model = BertGenerationEncoder(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "bert-generation"
def __init__(
diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py
index 43676e2801..f6b7a7f9cc 100644
--- a/src/transformers/models/bert_generation/tokenization_bert_generation.py
+++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -40,37 +40,36 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512}
class BertGenerationTokenizer(PreTrainedTokenizer):
"""
- Construct a BertGeneration tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+ Construct a BertGeneration tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The begin of sequence token.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
index 41c686e41e..0d50dadd00 100644
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -74,20 +74,20 @@ class BertJapaneseTokenizer(BertTokenizer):
Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to a one-wordpiece-per-line vocabulary file.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
- do_word_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_word_tokenize (`bool`, *optional*, defaults to `True`):
Whether to do word tokenization.
- do_subword_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_subword_tokenize (`bool`, *optional*, defaults to `True`):
Whether to do subword tokenization.
- word_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"basic"`):
+ word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
Type of word tokenizer.
- subword_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"wordpiece"`):
+ subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
Type of subword tokenizer.
- mecab_kwargs (:obj:`str`, `optional`):
- Dictionary passed to the :obj:`MecabTokenizer` constructor.
+ mecab_kwargs (`dict`, *optional*):
+ Dictionary passed to the `MecabTokenizer` constructor.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -210,17 +210,17 @@ class MecabTokenizer:
Constructs a MecabTokenizer.
Args:
- **do_lower_case**: (`optional`) boolean (default True)
+ **do_lower_case**: (*optional*) boolean (default True)
Whether to lowercase the input.
- **never_split**: (`optional`) list of str
+ **never_split**: (*optional*) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
- :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
- **normalize_text**: (`optional`) boolean (default True)
+ [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
+ **normalize_text**: (*optional*) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
- **mecab_dic**: (`optional`) string (default "ipadic")
+ **mecab_dic**: (*optional*) string (default "ipadic")
Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
- set this option to `None` and modify `mecab_option`.
- **mecab_option**: (`optional`) string
+ set this option to *None* and modify *mecab_option*.
+ **mecab_option**: (*optional*) string
String passed to MeCab constructor.
"""
self.do_lower_case = do_lower_case
@@ -326,11 +326,11 @@ class CharacterTokenizer:
"""
Tokenizes a piece of text into characters.
- For example, :obj:`input = "apple""` wil return as output :obj:`["a", "p", "p", "l", "e"]`.
+ For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`.
Args:
text: A single token or whitespace separated tokens.
- This should have already been passed through `BasicTokenizer`.
+ This should have already been passed through *BasicTokenizer*.
Returns:
A list of characters.
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index 76103d051c..dfa5e74699 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -69,43 +69,49 @@ class BertweetTokenizer(PreTrainedTokenizer):
"""
Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- normalization (:obj:`bool`, `optional`, defaults to :obj:`False`)
+ normalization (`bool`, *optional*, defaults to `False`)
Whether or not to apply a normalization preprocess.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -181,17 +187,17 @@ class BertweetTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERTweet sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -205,18 +211,18 @@ class BertweetTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -236,13 +242,13 @@ class BertweetTokenizer(PreTrainedTokenizer):
not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
@@ -621,12 +627,12 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
Args:
text:
- A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8').
+ A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
keep (list):
- List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and
- ``&#hhhh;``) and named entities (such as ``&nbsp;`` or ``&gt;``).
+ List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
+ `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
remove_illegal (bool):
- If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
+ If *True*, entities that can't be converted are removed. Otherwise, entities that can't be converted are
kept "as is".
Returns: A unicode string with the entities removed.
@@ -674,21 +680,22 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
class TweetTokenizer:
r"""
- Examples::
+ Examples:
- >>> # Tokenizer for tweets.
- >>> from nltk.tokenize import TweetTokenizer
- >>> tknzr = TweetTokenizer()
- >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
- >>> tknzr.tokenize(s0)
- ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
+ ```python
+ >>> # Tokenizer for tweets.
+ >>> from nltk.tokenize import TweetTokenizer
+ >>> tknzr = TweetTokenizer()
+ >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
+ >>> tknzr.tokenize(s0)
+ ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
- >>> # Examples using `strip_handles` and `reduce_len parameters`:
- >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
- >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
- >>> tknzr.tokenize(s1)
- [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
- """
+ >>> # Examples using the `strip_handles` and `reduce_len` parameters:
+ >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
+ >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
+ >>> tknzr.tokenize(s1)
+ [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
+ ```"""
def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
self.preserve_case = preserve_case
diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py
index 85dd8de7dd..80dd708b92 100644
--- a/src/transformers/models/big_bird/configuration_big_bird.py
+++ b/src/transformers/models/big_bird/configuration_big_bird.py
@@ -30,62 +30,65 @@ BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BigBirdConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BigBirdModel`. It is used to
+ This is the configuration class to store the configuration of a [`BigBirdModel`]. It is used to
instantiate an BigBird model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BigBird
- `google/bigbird-roberta-base `__ architecture.
+ [google/bigbird-roberta-base](https://huggingface.co/google/bigbird-roberta-base) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50358):
+ vocab_size (`int`, *optional*, defaults to 50358):
Vocabulary size of the BigBird model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.BigBirdModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `inputs_ids` passed when calling [`BigBirdModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 4096):
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 1024 or 2048 or 4096).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BigBirdModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`BigBirdModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if ``config.is_decoder=True``.
- attention_type (:obj:`str`, `optional`, defaults to :obj:`"block_sparse"`)
+ relevant if `config.is_decoder=True`.
+ attention_type (`str`, *optional*, defaults to `"block_sparse"`):
Whether to use block sparse attention (with n complexity) as introduced in paper or original attention
- layer (with n^2 complexity). Possible values are :obj:`"original_full"` and :obj:`"block_sparse"`.
- use_bias (:obj:`bool`, `optional`, defaults to :obj:`True`)
+ layer (with n^2 complexity). Possible values are `"original_full"` and `"block_sparse"`.
+ use_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias in query, key, value.
- rescale_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`)
+ rescale_embeddings (`bool`, *optional*, defaults to `False`):
Whether to rescale embeddings with (hidden_size ** 0.5).
- block_size (:obj:`int`, `optional`, defaults to 64)
- Size of each block. Useful only when :obj:`attention_type == "block_sparse"`.
- num_random_blocks (:obj:`int`, `optional`, defaults to 3)
- Each query is going to attend these many number of random blocks. Useful only when :obj:`attention_type ==
- "block_sparse"`.
- classifier_dropout (:obj:`float`, `optional`):
+ block_size (`int`, *optional*, defaults to 64):
+ Size of each block. Useful only when `attention_type == "block_sparse"`.
+ num_random_blocks (`int`, *optional*, defaults to 3):
+ Each query attends to this many random blocks. Useful only when `attention_type == "block_sparse"`.
+ classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
- Example::
+ Example:
+
+ ```python
+
+ ```
>>> from transformers import BigBirdModel, BigBirdConfig
diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py
index b1ed49cd36..a1be468934 100644
--- a/src/transformers/models/big_bird/modeling_flax_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -1635,18 +1635,20 @@ class FlaxBigBirdForPreTraining(FlaxBigBirdPreTrainedModel):
FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import BigBirdTokenizer, FlaxBigBirdForPreTraining
+ ```python
+ >>> from transformers import BigBirdTokenizer, FlaxBigBirdForPreTraining
- >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
- >>> model = FlaxBigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
+ >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
+ >>> model = FlaxBigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```
"""
overwrite_call_docstring(
diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py
index 92f652448d..355e3fd068 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird.py
@@ -46,47 +46,46 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class BigBirdTokenizer(PreTrainedTokenizer):
"""
- Construct a BigBird tokenizer. Based on `SentencePiece `__.
+ Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The begin of sequence token.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
@@ -200,17 +199,17 @@ class BigBirdTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A Big Bird sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -223,18 +222,18 @@ class BigBirdTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -251,16 +250,16 @@ class BigBirdTokenizer(PreTrainedTokenizer):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
- sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py
index 36f2afa337..26adf8a3ec 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird_fast.py
@@ -58,38 +58,40 @@ SPIECE_UNDERLINE = "▁"
class BigBirdTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" BigBird tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
- `__. This tokenizer
- inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+ Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+ inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
- that is used for the end of sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ that is used for the end of sequence. The token used is the `sep_token`.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -147,17 +149,17 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An BigBird sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -170,18 +172,18 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -203,21 +205,21 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
- ::
-
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
index 297e2cede4..2d9fdd18d7 100644
--- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
@@ -30,72 +30,75 @@ BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BigBirdPegasusConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BigBirdPegasusModel`. It is
+ This is the configuration class to store the configuration of a [`BigBirdPegasusModel`]. It is
used to instantiate an BigBirdPegasus model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the BigBirdPegasus
- `google/bigbird-pegasus-large-arxiv `__ architecture.
+ [google/bigbird-pegasus-large-arxiv](https://huggingface.co/google/bigbird-pegasus-large-arxiv) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 96103):
+ vocab_size (`int`, *optional*, defaults to 96103):
Vocabulary size of the BigBirdPegasus model. Defines the number of different tokens that can be represented
- by the :obj:`inputs_ids` passed when calling :class:`~transformers.BigBirdPegasusModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ by the `inputs_ids` passed when calling [`BigBirdPegasusModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimension of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 16):
+ encoder_layers (`int`, *optional*, defaults to 16):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 16):
+ decoder_layers (`int`, *optional*, defaults to 16):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 4096):
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 1024 or 2048 or 4096).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- attention_type (:obj:`str`, `optional`, defaults to :obj:`"block_sparse"`)
+ attention_type (`str`, *optional*, defaults to `"block_sparse"`):
Whether to use block sparse attention (with n complexity) as introduced in paper or original attention
- layer (with n^2 complexity) in encoder. Possible values are :obj:`"original_full"` and
- :obj:`"block_sparse"`.
- use_bias (:obj:`bool`, `optional`, defaults to :obj:`False`)
+ layer (with n^2 complexity) in encoder. Possible values are `"original_full"` and
+ `"block_sparse"`.
+ use_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in query, key, value.
- block_size (:obj:`int`, `optional`, defaults to 64)
- Size of each block. Useful only when :obj:`attention_type == "block_sparse"`.
- num_random_blocks (:obj:`int`, `optional`, defaults to 3)
- Each query is going to attend these many number of random blocks. Useful only when :obj:`attention_type ==
- "block_sparse"`.
- scale_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`)
+ block_size (`int`, *optional*, defaults to 64):
+ Size of each block. Useful only when `attention_type == "block_sparse"`.
+ num_random_blocks (`int`, *optional*, defaults to 3):
+ Each query attends to this many random blocks. Useful only when `attention_type == "block_sparse"`.
+ scale_embeddings (`bool`, *optional*, defaults to `True`):
Whether to rescale embeddings with (hidden_size ** 0.5).
- Example::
+ Example:
+
+ ```python
+
+ ```
>>> from transformers import BigBirdPegasusModel, BigBirdPegasusConfig
diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py
index 13acbdf699..5dccf86d9c 100644
--- a/src/transformers/models/blenderbot/configuration_blenderbot.py
+++ b/src/transformers/models/blenderbot/configuration_blenderbot.py
@@ -28,77 +28,78 @@ BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BlenderbotConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotModel`. It is used
+ This is the configuration class to store the configuration of a [`BlenderbotModel`]. It is used
to instantiate an Blenderbot model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot
- `facebook/blenderbot-3B `__ architecture.
+ [facebook/blenderbot-3B](https://huggingface.co/facebook/blenderbot-3B) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotModel` or
- :class:`~transformers.TFBlenderbotModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ the `input_ids` passed when calling [`BlenderbotModel`] or
+ [`TFBlenderbotModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 128):
+ max_position_embeddings (`int`, *optional*, defaults to 128):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
- forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
- The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
- :obj:`eos_token_id`.
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
- Example::
+ Example:
- >>> from transformers import BlenderbotModel, BlenderbotConfig
+ ```python
+ >>> from transformers import BlenderbotModel, BlenderbotConfig
- >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
- >>> configuration = BlenderbotConfig()
+ >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
+ >>> configuration = BlenderbotConfig()
- >>> # Initializing a model from the facebook/blenderbot-3B style configuration
- >>> model = BlenderbotModel(configuration)
+ >>> # Initializing a model from the facebook/blenderbot-3B style configuration
+ >>> model = BlenderbotModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "blenderbot"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index d92678a76e..fc9597f76e 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -1128,19 +1128,20 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotTokenizer, BlenderbotModel
+ ```python
+ >>> from transformers import BlenderbotTokenizer, BlenderbotModel
- >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
- >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
+ >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
+ >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
- >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
- >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+ >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
+ >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
index 068161dbcf..f3dcf35f64 100644
--- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
@@ -977,17 +977,18 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
+ ```python
+ >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
- >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
- >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+ >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
+ >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
- """
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1044,23 +1045,24 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
+ ```python
+ >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
- >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
- >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+ >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
+ >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> last_decoder_hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1312,23 +1314,24 @@ class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
+ ```python
+ >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
- >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
- >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+ >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
+ >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py
index e003d80534..966b1294db 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -47,11 +47,11 @@ class BlenderbotTokenizer(RobertaTokenizer):
r"""
Construct a Blenderbot tokenizer.
- :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
+ [`BlenderbotTokenizer`] is nearly identical to [`RobertaTokenizer`] and runs
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
to the beginning of sequences.
- Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`RobertaTokenizer`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -63,16 +63,16 @@ class BlenderbotTokenizer(RobertaTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A Blenderbot sequence has the following format:
- - single sequence: `` X ``
+ - single sequence: ` X </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Will be ignored
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
return token_ids_0 + [self.eos_token_id]
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
index f7835d573c..f04ce1b369 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -46,13 +46,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
class BlenderbotTokenizerFast(RobertaTokenizerFast):
r"""
- Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.BlenderbotFast` is nearly identical to :class:`~transformers.RobertaTokenizerFast` and runs
+ [`BlenderbotTokenizerFast`] is nearly identical to [`RobertaTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
to the beginning of sequences.
- Refer to superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`RobertaTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -65,16 +65,16 @@ class BlenderbotTokenizerFast(RobertaTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A Blenderbot sequence has the following format:
- - single sequence: `` X ``
+ - single sequence: ` X </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Will be ignored
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
return token_ids_0 + [self.eos_token_id]
diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
index 0f76e2e3ae..2490cb0207 100644
--- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
@@ -28,77 +28,78 @@ BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BlenderbotSmallConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotSmallModel`. It is
+ This is the configuration class to store the configuration of a [`BlenderbotSmallModel`]. It is
used to instantiate an BlenderbotSmall model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall
- `facebook/blenderbot_small-90M `__ architecture.
+ [facebook/blenderbot_small-90M](https://huggingface.co/facebook/blenderbot_small-90M) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be
- represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotSmallModel` or
- :class:`~transformers.TFBlenderbotSmallModel`.
- d_model (:obj:`int`, `optional`, defaults to 512):
+ represented by the `input_ids` passed when calling [`BlenderbotSmallModel`] or
+ [`TFBlenderbotSmallModel`].
+ d_model (`int`, *optional*, defaults to 512):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 8):
+ encoder_layers (`int`, *optional*, defaults to 8):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 8):
+ decoder_layers (`int`, *optional*, defaults to 8):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+ encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
- forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
- The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
- :obj:`eos_token_id`.
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
- Example::
+ Example:
- >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig
+ ```python
+ >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig
- >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
- >>> configuration = BlenderbotSmallConfig()
+ >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
+ >>> configuration = BlenderbotSmallConfig()
- >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
- >>> model = BlenderbotSmallModel(configuration)
+ >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
+ >>> model = BlenderbotSmallModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "blenderbot-small"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index 276ff96a54..e0dcd95aa6 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -1115,19 +1115,20 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel
+ ```python
+ >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel
- >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
- >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
+ >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
+ >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
- >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
- >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+ >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
+ >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
index 1daf801b78..fd27a6c094 100644
--- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
@@ -989,17 +989,18 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
+ ```python
+ >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
- >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
- >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+ >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
+ >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(**inputs)
- """
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+ >>> encoder_outputs = model.encode(**inputs)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1056,23 +1057,24 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
+ ```python
+ >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
- >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
- >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+ >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
+ >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> last_decoder_hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1324,23 +1326,24 @@ class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedM
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
+ ```python
+ >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
- >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
- >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+ >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
+ >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
index 1b8104e924..29746559be 100644
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -68,25 +68,25 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
"""
Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding)
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to the superclass for more information regarding methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`):
+ bos_token (`str`, *optional*, defaults to `"__start__"`):
The beginning of sentence token.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
+ eos_token (`str`, *optional*, defaults to `"__end__"`):
The end of sentence token.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
+ unk_token (`str`, *optional*, defaults to `"__unk__"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`):
+ pad_token (`str`, *optional*, defaults to `"__pad__"`):
The token used for padding, for example when batching sequences of different lengths.
**kwargs
- Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+ Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
index 2867b598b7..63c8c39563 100644
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
@@ -49,10 +49,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library).
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
"""
@@ -101,13 +101,13 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
does not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py
index 4714dbd27e..7c0e94f35c 100644
--- a/src/transformers/models/byt5/tokenization_byt5.py
+++ b/src/transformers/models/byt5/tokenization_byt5.py
@@ -29,29 +29,31 @@ class ByT5Tokenizer(PreTrainedTokenizer):
"""
Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- extra_ids (:obj:`int`, `optional`, defaults to 100):
+ extra_ids (`int`, *optional*, defaults to 100):
Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
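- like in ByT5 preprocessing see `here
- <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).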
- additional_special_tokens (:obj:`List[str]`, `optional`):
+ like in ByT5 preprocessing see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+ additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
"""
@@ -116,18 +118,18 @@ class ByT5Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -157,13 +159,13 @@ class ByT5Tokenizer(PreTrainedTokenizer):
make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
@@ -178,17 +180,17 @@ class ByT5Tokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- - single sequence: ``X </s>``
- - pair of sequences: ``A </s> B </s>``
+ - single sequence: `X </s>`
+ - pair of sequences: `A </s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py
index 8a55e1c320..2479ca98a6 100644
--- a/src/transformers/models/camembert/configuration_camembert.py
+++ b/src/transformers/models/camembert/configuration_camembert.py
@@ -34,7 +34,7 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig):
"""
- This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate
+ This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index c367e31255..2bb76236ee 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -44,65 +44,70 @@ SPIECE_UNDERLINE = "▁"
class CamembertTokenizer(PreTrainedTokenizer):
"""
- Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Construct a
- CamemBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+ Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a
+ CamemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -158,17 +163,17 @@ class CamembertTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An CamemBERT sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -182,18 +187,18 @@ class CamembertTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -212,13 +217,13 @@ class CamembertTokenizer(PreTrainedTokenizer):
RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py
index cce7e2f63c..782ba2f5c3 100644
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -53,47 +53,52 @@ SPIECE_UNDERLINE = "▁"
class CamembertTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
- :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
- `__.
+ Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+ [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
"""
@@ -144,17 +149,17 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An CamemBERT sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -171,13 +176,13 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
diff --git a/src/transformers/models/canine/configuration_canine.py b/src/transformers/models/canine/configuration_canine.py
index 79be54a824..b57a4fafff 100644
--- a/src/transformers/models/canine/configuration_canine.py
+++ b/src/transformers/models/canine/configuration_canine.py
@@ -28,66 +28,66 @@ CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CanineConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.CanineModel`. It is used to
+ This is the configuration class to store the configuration of a [`CanineModel`]. It is used to
instantiate an CANINE model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the CANINE `google/canine-s
- <https://huggingface.co/google/canine-s>`__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the CANINE [google/canine-s](https://huggingface.co/google/canine-s) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the deep Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoders.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoders.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoders, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
+ max_position_embeddings (`int`, *optional*, defaults to 16384):
The maximum sequence length that this model might ever be used with.
- type_vocab_size (:obj:`int`, `optional`, defaults to 16):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.CanineModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 16):
+ The vocabulary size of the `token_type_ids` passed when calling [`CanineModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- downsampling_rate (:obj:`int`, `optional`, defaults to 4):
+ downsampling_rate (`int`, *optional*, defaults to 4):
The rate at which to downsample the original character sequence length before applying the deep Transformer
encoder.
- upsampling_kernel_size (:obj:`int`, `optional`, defaults to 4):
+ upsampling_kernel_size (`int`, *optional*, defaults to 4):
The kernel size (i.e. the number of characters in each window) of the convolutional projection layer when
- projecting back from :obj:`hidden_size`*2 to :obj:`hidden_size`.
- num_hash_functions (:obj:`int`, `optional`, defaults to 8):
+ projecting back from `hidden_size`*2 to `hidden_size`.
+ num_hash_functions (`int`, *optional*, defaults to 8):
The number of hash functions to use. Each hash function has its own embedding matrix.
- num_hash_buckets (:obj:`int`, `optional`, defaults to 16384):
+ num_hash_buckets (`int`, *optional*, defaults to 16384):
The number of hash buckets to use.
- local_transformer_stride (:obj:`int`, `optional`, defaults to 128):
+ local_transformer_stride (`int`, *optional*, defaults to 128):
The stride of the local attention of the first shallow Transformer encoder. Defaults to 128 for good
TPU/XLA memory alignment.
- Example::
+ Example:
- >>> from transformers import CanineModel, CanineConfig
+ ```python
+ >>> from transformers import CanineModel, CanineConfig
- >>> # Initializing a CANINE google/canine-s style configuration
- >>> configuration = CanineConfig()
+ >>> # Initializing a CANINE google/canine-s style configuration
+ >>> configuration = CanineConfig()
- >>> # Initializing a model from the google/canine-s style configuration
- >>> model = CanineModel(configuration)
+ >>> # Initializing a model from the google/canine-s style configuration
+ >>> model = CanineModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "canine"
def __init__(
diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py
index 87580629c8..4bf96f1828 100644
--- a/src/transformers/models/canine/tokenization_canine.py
+++ b/src/transformers/models/canine/tokenization_canine.py
@@ -65,13 +65,13 @@ class CanineTokenizer(PreTrainedTokenizer):
Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
converts each character into its Unicode code point.
- :class:`~transformers.CanineTokenizer` inherits from :class:`~transformers.PreTrainedTokenizer`.
+ [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].
- Refer to superclass :class:`~transformers.PreTrainedTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning
parameters.
Args:
- model_max_length (:obj:`int`, `optional`, defaults to 2048):
+ model_max_length (`int`, *optional*, defaults to 2048):
The maximum sentence length the model accepts.
"""
@@ -160,17 +160,17 @@ class CanineTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A CANINE sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -185,18 +185,18 @@ class CanineTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -215,21 +215,21 @@ class CanineTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
sequence pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py
index 0f8b6fa9a4..e0a34e722d 100644
--- a/src/transformers/models/clip/configuration_clip.py
+++ b/src/transformers/models/clip/configuration_clip.py
@@ -30,58 +30,58 @@ CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CLIPTextConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.CLIPModel`. It is used to
+ This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to
instantiate an CLIP model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the CLIP
- `openai/clip-vit-base-patch32 <https://huggingface.co/openai/clip-vit-base-patch32>`__ architecture.
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 49408):
+ vocab_size (`int`, *optional*, defaults to 49408):
Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.CLIPModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 512):
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
- intermediate_size (:obj:`int`, `optional`, defaults to 2048):
+ intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 8):
+ num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 77):
+ max_position_embeddings (`int`, *optional*, defaults to 77):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"quick_gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` :obj:`"quick_gelu"` are supported.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5):
+ `"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- dropout (:obj:`float`, `optional`, defaults to 0.0):
+ dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- initializer_factor (:obj:`float`, `optional`, defaults to 1):
+ initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
- Example::
+ Example:
- >>> from transformers import CLIPTextModel, CLIPTextConfig
+ ```python
+ >>> from transformers import CLIPTextModel, CLIPTextConfig
- >>> # Initializing a CLIPTextModel with openai/clip-vit-base-patch32 style configuration
- >>> configuration = CLIPTextConfig()
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
- >>> # Initializing a CLIPTextConfig from the openai/clip-vit-base-patch32 style configuration
- >>> model = CLIPTextModel(configuration)
+ >>> # Initializing a CLIPTextModel from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "clip_text_model"
def __init__(
@@ -121,56 +121,56 @@ class CLIPTextConfig(PretrainedConfig):
class CLIPVisionConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.CLIPModel`. It is used to
+ This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to
instantiate an CLIP model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the CLIP
- `openai/clip-vit-base-patch32 <https://huggingface.co/openai/clip-vit-base-patch32>`__ architecture.
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- image_size (:obj:`int`, `optional`, defaults to 224):
+ image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
- patch_size (:obj:`int`, `optional`, defaults to 32):
+ patch_size (`int`, *optional*, defaults to 32):
The size (resolution) of each patch.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"quick_gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` :obj:`"quick_gelu"` are supported.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5):
+ `"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (:obj:`float`, `optional`, defaults to 0.0):
+ dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- initializer_factor (:obj:`float`, `optional`, defaults to 1):
+ initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
- Example::
+ Example:
- >>> from transformers import CLIPVisionModel, CLIPVisionConfig
+ ```python
+ >>> from transformers import CLIPVisionModel, CLIPVisionConfig
- >>> # Initializing a CLIPVisionModel with openai/clip-vit-base-patch32 style configuration
- >>> configuration = CLIPVisionConfig()
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
- >>> # Initializing a CLIPVisionModel model from the openai/clip-vit-base-patch32 style configuration
- >>> model = CLIPVisionModel(configuration)
+ >>> # Initializing a CLIPVisionModel from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "clip_vision_model"
@@ -208,23 +208,23 @@ class CLIPVisionConfig(PretrainedConfig):
class CLIPConfig(PretrainedConfig):
r"""
- :class:`~transformers.CLIPConfig` is the configuration class to store the configuration of a
- :class:`~transformers.CLIPModel`. It is used to instantiate CLIP model according to the specified arguments,
+ [`CLIPConfig`] is the configuration class to store the configuration of a
+ [`CLIPModel`]. It is used to instantiate CLIP model according to the specified arguments,
defining the text model and vision model configs.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- text_config_dict (:obj:`dict`, `optional`):
- Dictionary of configuration options used to initialize :class:`~transformers.CLIPTextConfig`.
- vision_config_dict (:obj:`dict`, `optional`):
- Dictionary of configuration options used to initialize :class:`~transformers.CLIPVisionConfig`.
- projection_dim (:obj:`int`, `optional`, defaults to 512):
+ text_config_dict (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config_dict (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
Dimentionality of text and vision projection layers.
- logit_scale_init_value (:obj:`float`, `optional`, defaults to 2.6592):
- The inital value of the `logit_scale` paramter. Default is used as per the original CLIP implementation.
- kwargs (`optional`):
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+ The initial value of the `logit_scale` parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
Dictionary of keyword arguments.
"""
@@ -259,11 +259,11 @@ class CLIPConfig(PretrainedConfig):
@classmethod
def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
r"""
- Instantiate a :class:`~transformers.CLIPConfig` (or a derived class) from clip text model configuration and
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and
clip vision model configuration.
Returns:
- :class:`CLIPConfig`: An instance of a configuration object
+ [`CLIPConfig`]: An instance of a configuration object
"""
return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs)
@@ -271,10 +271,10 @@ class CLIPConfig(PretrainedConfig):
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default
- :meth:`~transformers.PretrainedConfig.to_dict`.
+ [`~PretrainedConfig.to_dict`].
Returns:
- :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["text_config"] = self.text_config.to_dict()
diff --git a/src/transformers/models/clip/feature_extraction_clip.py b/src/transformers/models/clip/feature_extraction_clip.py
index b6256b4686..45c5289c90 100644
--- a/src/transformers/models/clip/feature_extraction_clip.py
+++ b/src/transformers/models/clip/feature_extraction_clip.py
@@ -32,29 +32,29 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a CLIP feature extractor.
- This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+ This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to resize the input to a certain :obj:`size`.
- size (:obj:`int`, `optional`, defaults to 224):
- Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`.
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
- An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
- :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
- Only has an effect if :obj:`do_resize` is set to :obj:`True`.
- do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the input to a certain `size`.
+ size (`int`, *optional*, defaults to 224):
+ Resize the input to the given size. Only has an effect if `do_resize` is set to `True`.
+ resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+ An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+ `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+ Only has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
the image is padded with 0's and then center cropped.
- crop_size (:obj:`int`, `optional`, defaults to 224):
- Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
- :obj:`True`.
- do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
- image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+ crop_size (`int`, *optional*, defaults to 224):
+ Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+ `True`.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether or not to normalize the input with `image_mean` and `image_std`.
+ image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
- image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+ image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
@@ -93,27 +93,29 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Main method to prepare for the model one or several image(s).
- .. warning::
+ <Tip warning={true}>
- NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
- PIL images.
+ NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+ PIL images.
+
+ </Tip>
Args:
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
- * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model.
"""
@@ -157,13 +159,13 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
def center_crop(self, image, size):
"""
- Crops :obj:`image` to the given size using a center crop. Note that if the image is too small to be cropped to
+ Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
the size is given, it will be padded (so the returned result has the size asked).
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
- size (:obj:`int` or :obj:`Tuple[int, int]`):
+ size (`int` or `Tuple[int, int]`):
The size to which crop the image.
"""
self._ensure_format_supported(image)
@@ -183,14 +185,14 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
def resize(self, image, size, resample=Image.BICUBIC):
"""
- Resizes :obj:`image`. Note that this will trigger a conversion of :obj:`image` to a PIL Image.
+ Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
- size (:obj:`int` or :obj:`Tuple[int, int]`):
- The size to use for resizing the image. If :obj:`int` it will be resized to match the shorter side
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
+ size (`int` or `Tuple[int, int]`):
+ The size to use for resizing the image. If `int` it will be resized to match the shorter side
+ resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
The filter to user for resampling.
"""
self._ensure_format_supported(image)
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index d61ce2553c..45e7c0b307 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -704,19 +704,20 @@ class CLIPTextModel(CLIPPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import CLIPTokenizer, CLIPTextModel
+ ```python
+ >>> from transformers import CLIPTokenizer, CLIPTextModel
- >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_state = outputs.last_hidden_state
- >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
- """
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
@@ -810,24 +811,25 @@ class CLIPVisionModel(CLIPPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import CLIPProcessor, CLIPVisionModel
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import CLIPProcessor, CLIPVisionModel
- >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(images=image, return_tensors="pt")
+ >>> inputs = processor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_state = outputs.last_hidden_state
- >>> pooled_output = outputs.pooler_output # pooled CLS states
- """
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
@@ -968,25 +970,25 @@ class CLIPModel(CLIPPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import CLIPProcessor, CLIPModel
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import CLIPProcessor, CLIPModel
- >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+ >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
- >>> outputs = model(**inputs)
- >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
- >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
-
- """
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py
index 13530e39d3..97db507467 100644
--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -940,18 +940,20 @@ class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel):
FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import CLIPTokenizer, FlaxCLIPTextModel
+ ```python
+ >>> from transformers import CLIPTokenizer, FlaxCLIPTextModel
- >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
- >>> outputs = model(**inputs)
- >>> last_hidden_state = outputs.last_hidden_state
- >>> pooler_output = outputs.pooler_output # pooled (EOS token) states
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooler_output = outputs.pooler_output # pooled (EOS token) states
+ ```
"""
overwrite_call_docstring(FlaxCLIPTextModel, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_DOCSTRING)
@@ -991,23 +993,25 @@ class FlaxCLIPVisionModel(FlaxCLIPVisionPreTrainedModel):
FLAX_CLIP_VISION_MODEL_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import CLIPProcessor, FlaxCLIPVisionModel
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import CLIPProcessor, FlaxCLIPVisionModel
- >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(images=image, return_tensors="np")
+ >>> inputs = processor(images=image, return_tensors="np")
- >>> outputs = model(**inputs)
- >>> last_hidden_state = outputs.last_hidden_state
- >>> pooler_output = outputs.pooler_output # pooled CLS states
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooler_output = outputs.pooler_output # pooled CLS states
+ ```
"""
overwrite_call_docstring(FlaxCLIPVisionModel, CLIP_VISION_INPUTS_DOCSTRING + FLAX_CLIP_VISION_MODEL_DOCSTRING)
@@ -1115,24 +1119,26 @@ class FlaxCLIPModel(FlaxCLIPPreTrainedModel):
FLAX_CLIP_MODEL_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> import jax
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import CLIPProcessor, FlaxCLIPModel
+ ```python
+ >>> import jax
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import CLIPProcessor, FlaxCLIPModel
- >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)
+ >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)
- >>> outputs = model(**inputs)
- >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
- >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+ ```
"""
overwrite_call_docstring(FlaxCLIPModel, CLIP_INPUTS_DOCSTRING + FLAX_CLIP_MODEL_DOCSTRING)
diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py
index e75199f2b2..caae7983c1 100644
--- a/src/transformers/models/clip/processing_clip.py
+++ b/src/transformers/models/clip/processing_clip.py
@@ -24,14 +24,14 @@ class CLIPProcessor:
r"""
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
- :class:`~transformers.CLIPProcessor` offers all the functionalities of :class:`~transformers.CLIPFeatureExtractor`
- and :class:`~transformers.CLIPTokenizer`. See the :meth:`~transformers.CLIPProcessor.__call__` and
- :meth:`~transformers.CLIPProcessor.decode` for more information.
+ [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`]
+ and [`CLIPTokenizer`]. See the [`~CLIPProcessor.__call__`] and
+ [`~CLIPProcessor.decode`] for more information.
Args:
- feature_extractor (:class:`~transformers.CLIPFeatureExtractor`):
+ feature_extractor ([`CLIPFeatureExtractor`]):
The feature extractor is a required input.
- tokenizer (:class:`~transformers.CLIPTokenizer`):
+ tokenizer ([`CLIPTokenizer`]):
The tokenizer is a required input.
"""
@@ -49,17 +49,19 @@ class CLIPProcessor:
def save_pretrained(self, save_directory):
"""
- Save a CLIP feature extractor object and CLIP tokenizer object to the directory ``save_directory``, so that it
- can be re-loaded using the :func:`~transformers.CLIPProcessor.from_pretrained` class method.
+ Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
+ can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
- .. note::
+ <Tip>
- This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
- :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
- docstrings of the methods above for more information.
+ This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+ [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+ docstrings of the methods above for more information.
+
+ </Tip>
Args:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
@@ -70,31 +72,33 @@ class CLIPProcessor:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
- Instantiate a :class:`~transformers.CLIPProcessor` from a pretrained CLIP processor.
+ Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
- .. note::
+ <Tip>
- This class method is simply calling CLIPFeatureExtractor's
- :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and CLIPTokenizer's
- :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
- docstrings of the methods above for more information.
+ This class method is simply calling CLIPFeatureExtractor's
+ [`~PreTrainedFeatureExtractor.from_pretrained`] and CLIPTokenizer's
+ [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+ docstrings of the methods above for more information.
+
+ </Tip>
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``clip-vit-base-patch32``, or
- namespaced under a user or organization name, like ``openai/clip-vit-base-patch32``.
- - a path to a `directory` containing a feature extractor file saved using the
- :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
- ``./my_model_directory/``.
- - a path or url to a saved feature extractor JSON `file`, e.g.,
- ``./my_model_directory/preprocessor_config.json``.
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
+ namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
+ - a path to a *directory* containing a feature extractor file saved using the
+ [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+ `./my_model_directory/`.
+ - a path or url to a saved feature extractor JSON *file*, e.g.,
+ `./my_model_directory/preprocessor_config.json`.
**kwargs
- Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
- :class:`~transformers.PreTrainedTokenizer`
+ Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+ [`PreTrainedTokenizer`]
"""
feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -104,38 +108,38 @@ class CLIPProcessor:
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the
- :obj:`text` and :obj:`kwargs` arguments to CLIPTokenizer's :meth:`~transformers.CLIPTokenizer.__call__` if
- :obj:`text` is not :obj:`None` to encode the text. To prepare the image(s), this method forwards the
- :obj:`images` and :obj:`kwrags` arguments to CLIPFeatureExtractor's
- :meth:`~transformers.CLIPFeatureExtractor.__call__` if :obj:`images` is not :obj:`None`. Please refer to the
+ `text` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if
+ `text` is not `None` to encode the text. To prepare the image(s), this method forwards the
+ `images` and `kwargs` arguments to CLIPFeatureExtractor's
+ [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
doctsring of the above two methods for more information.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
- :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
- * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
- :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- - **input_ids** -- List of token ids to be fed to a model. Returned when :obj:`text` is not :obj:`None`.
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
- :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names` and if
- :obj:`text` is not :obj:`None`).
- - **pixel_values** -- Pixel values to be fed to a model. Returned when :obj:`images` is not :obj:`None`.
+ `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names` and if
+ `text` is not `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
@@ -158,14 +162,14 @@ class CLIPProcessor:
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizer's
- :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+ [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
- This method forwards all its arguments to CLIPTokenizer's :meth:`~transformers.PreTrainedTokenizer.decode`.
+ This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.decode`].
Please refer to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py
index 474fc24421..a3da5bb56e 100644
--- a/src/transformers/models/clip/tokenization_clip.py
+++ b/src/transformers/models/clip/tokenization_clip.py
@@ -105,33 +105,34 @@ class CLIPTokenizer(PreTrainedTokenizer):
be encoded differently whether it is at the beginning of the sentence (without space) or not:
- You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
- .. note::
+ <Tip>
- When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
- one).
+ When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
+ one).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ </Tip>
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
- Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
- `__ for more information.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
- add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CLIP tokenizer detect beginning of words by the preceding space).
"""
@@ -200,7 +201,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
@property
def pad_token_id(self) -> Optional[int]:
"""
- :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
+ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
set.
"""
return 0
@@ -219,18 +220,18 @@ class CLIPTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A CLIP sequence has the following format:
- - single sequence: ``<|startoftext|> X <|endoftext|>``
+ - single sequence: `<|startoftext|> X <|endoftext|>`
Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
@@ -241,18 +242,18 @@ class CLIPTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py
index 876c6f7bf5..1870c3b8ae 100644
--- a/src/transformers/models/clip/tokenization_clip_fast.py
+++ b/src/transformers/models/clip/tokenization_clip_fast.py
@@ -49,51 +49,52 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class CLIPTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" CLIP tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
+ Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
- ::
+ ```
+ >>> from transformers import CLIPTokenizerFast
+ >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer("Hello world")['input_ids']
+ [15496, 995]
+ >>> tokenizer(" Hello world")['input_ids']
+ [18435, 995]
+ ```
- >>> from transformers import CLIPTokenizerFast
- >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
- >>> tokenizer("Hello world")['input_ids']
- [15496, 995]
- >>> tokenizer(" Hello world")['input_ids']
- [18435, 995]
-
- You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
- .. note::
+ <Tip>
- When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
- ``add_prefix_space=True``.
+ When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
+ `add_prefix_space=True`.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ </Tip>
+
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
- Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
- `__ for more information.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
- add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CLIP tokenizer detect beginning of words by the preceding space).
- trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
@@ -139,7 +140,7 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
@property
def pad_token_id(self) -> Optional[int]:
"""
- :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
+ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
set.
"""
return 0
diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py
index 1f904ddfce..bce7518754 100644
--- a/src/transformers/models/convbert/configuration_convbert.py
+++ b/src/transformers/models/convbert/configuration_convbert.py
@@ -30,61 +30,62 @@ CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ConvBertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.ConvBertModel`. It is used to
+ This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to
instantiate an ConvBERT model according to the specified arguments, defining the model architecture. Instantiating
- a configuration with the defaults will yield a similar configuration to that of the ConvBERT `conv-bert-base
- `__ architecture. Configuration objects inherit from
- :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
- :class:`~transformers.PretrainedConfig` for more information.
+ a configuration with the defaults will yield a similar configuration to that of the ConvBERT [conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture. Configuration objects inherit from
+ [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+ [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the ConvBERT model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.ConvBertModel` or
- :class:`~transformers.TFConvBertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ the `inputs_ids` passed when calling [`ConvBertModel`] or
+ [`TFConvBertModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ConvBertModel`
- or :class:`~transformers.TFConvBertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`ConvBertModel`]
+ or [`TFConvBertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- head_ratio (:obj:`int`, `optional`, defaults to 2):
+ head_ratio (`int`, *optional*, defaults to 2):
Ratio gamma to reduce the number of attention heads.
- num_groups (:obj:`int`, `optional`, defaults to 1):
+ num_groups (`int`, *optional*, defaults to 1):
The number of groups for grouped linear layers for ConvBert model
- conv_kernel_size (:obj:`int`, `optional`, defaults to 9):
+ conv_kernel_size (`int`, *optional*, defaults to 9):
The size of the convolutional kernel.
- classifier_dropout (:obj:`float`, `optional`):
+ classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
- Example::
- >>> from transformers import ConvBertModel, ConvBertConfig
- >>> # Initializing a ConvBERT convbert-base-uncased style configuration
- >>> configuration = ConvBertConfig()
- >>> # Initializing a model from the convbert-base-uncased style configuration
- >>> model = ConvBertModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ Example:
+
+ ```python
+ >>> from transformers import ConvBertModel, ConvBertConfig
+ >>> # Initializing a ConvBERT convbert-base-uncased style configuration
+ >>> configuration = ConvBertConfig()
+ >>> # Initializing a model from the convbert-base-uncased style configuration
+ >>> model = ConvBertModel(configuration)
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "convbert"
def __init__(
diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py
index 12ee66ed28..e4a73f5018 100644
--- a/src/transformers/models/convbert/tokenization_convbert.py
+++ b/src/transformers/models/convbert/tokenization_convbert.py
@@ -45,9 +45,9 @@ PRETRAINED_INIT_CONFIGURATION = {
class ConvBertTokenizer(BertTokenizer):
r"""
- Construct a ConvBERT tokenizer. :class:`~transformers.ConvBertTokenizer` is identical to
- :class:`~transformers.BertTokenizer` and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer
- to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning parameters.
+ Construct a ConvBERT tokenizer. [`ConvBertTokenizer`] is identical to
+ [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer
+ to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py
index 4bc4c05234..8a0f42880c 100644
--- a/src/transformers/models/convbert/tokenization_convbert_fast.py
+++ b/src/transformers/models/convbert/tokenization_convbert_fast.py
@@ -46,12 +46,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class ConvBertTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.ConvBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`ConvBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py
index 7410128a92..5cd3a72012 100644
--- a/src/transformers/models/cpm/tokenization_cpm.py
+++ b/src/transformers/models/cpm/tokenization_cpm.py
@@ -33,59 +33,64 @@ class CpmTokenizer(XLNetTokenizer):
def __init__(self, *args, **kwargs):
"""
- Construct a CPM tokenizer. Based on `Jieba ` and `SentencePiece
- `__.
+ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a .spm extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether to lowercase the input when tokenizing.
- remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ remove_space (`bool`, *optional*, defaults to `True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
- keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ keep_accents (`bool`, *optional*, defaults to `False`):
Whether to keep accents when tokenizing.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier
token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning
- of sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning
+ of sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
this token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ sep_token (`str`, *optional*, defaults to `"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering. It is also used as the
last token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
super().__init__(*args, **kwargs)
try:
diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py
index 24a856c73c..42a627d88c 100644
--- a/src/transformers/models/cpm/tokenization_cpm_fast.py
+++ b/src/transformers/models/cpm/tokenization_cpm_fast.py
@@ -36,59 +36,64 @@ class CpmTokenizerFast(XLNetTokenizerFast):
def __init__(self, *args, **kwargs):
"""
- Construct a CPM tokenizer. Based on `Jieba ` and `SentencePiece
- `__.
+ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a .spm extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether to lowercase the input when tokenizing.
- remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ remove_space (`bool`, *optional*, defaults to `True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
- keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ keep_accents (`bool`, *optional*, defaults to `False`):
Whether to keep accents when tokenizing.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier
token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning
- of sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning
+ of sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
this token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ sep_token (`str`, *optional*, defaults to `"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering. It is also used as the
last token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
super().__init__(*args, **kwargs)
try:
diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py
index 2db3f778f8..5c8aa366a0 100644
--- a/src/transformers/models/ctrl/configuration_ctrl.py
+++ b/src/transformers/models/ctrl/configuration_ctrl.py
@@ -25,57 +25,58 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://huggingface.co/ctrl/resol
class CTRLConfig(PretrainedConfig):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel` or a
- :class:`~transformers.TFCTRLModel`. It is used to instantiate a CTRL model according to the specified arguments,
+ This is the configuration class to store the configuration of a [`CTRLModel`] or a
+ [`TFCTRLModel`]. It is used to instantiate a CTRL model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
- to that of the `ctrl `__ architecture from SalesForce.
+ to that of the [ctrl](https://huggingface.co/ctrl) architecture from SalesForce.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 246534):
+ vocab_size (`int`, *optional*, defaults to 246534):
Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.CTRLModel` or
- :class:`~transformers.TFCTRLModel`.
- n_positions (:obj:`int`, `optional`, defaults to 256):
+ `input_ids` passed when calling [`CTRLModel`] or
+ [`TFCTRLModel`].
+ n_positions (`int`, *optional*, defaults to 256):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- n_embd (:obj:`int`, `optional`, defaults to 1280):
+ n_embd (`int`, *optional*, defaults to 1280):
Dimensionality of the embeddings and hidden states.
- dff (:obj:`int`, `optional`, defaults to 8192):
+ dff (`int`, *optional*, defaults to 8192):
Dimensionality of the inner dimension of the feed forward networks (FFN).
- n_layer (:obj:`int`, `optional`, defaults to 48):
+ n_layer (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
- n_head (:obj:`int`, `optional`, defaults to 16):
+ n_head (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
- attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
The epsilon to use in the layer normalization layers
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- Examples::
+ Examples:
- >>> from transformers import CTRLModel, CTRLConfig
+ ```python
+ >>> from transformers import CTRLModel, CTRLConfig
- >>> # Initializing a CTRL configuration
- >>> configuration = CTRLConfig()
+ >>> # Initializing a CTRL configuration
+ >>> configuration = CTRLConfig()
- >>> # Initializing a model from the configuration
- >>> model = CTRLModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = CTRLModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "ctrl"
keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py
index 31ac0637a9..86c24f3125 100644
--- a/src/transformers/models/ctrl/tokenization_ctrl.py
+++ b/src/transformers/models/ctrl/tokenization_ctrl.py
@@ -120,15 +120,15 @@ class CTRLTokenizer(PreTrainedTokenizer):
"""
Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `""`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py
index 30a984f620..c7701db961 100644
--- a/src/transformers/models/deberta/configuration_deberta.py
+++ b/src/transformers/models/deberta/configuration_deberta.py
@@ -32,59 +32,59 @@ DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DebertaConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.DebertaModel` or a
- :class:`~transformers.TFDebertaModel`. It is used to instantiate a DeBERTa model according to the specified
+ This is the configuration class to store the configuration of a [`DebertaModel`] or a
+ [`TFDebertaModel`]. It is used to instantiate a DeBERTa model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the DeBERTa `microsoft/deberta-base `__
+ configuration to that of the DeBERTa [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base)
architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Arguments:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the DeBERTa model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaModel` or
- :class:`~transformers.TFDebertaModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `input_ids` passed when calling [`DebertaModel`] or
+ [`TFDebertaModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
- :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`,
+ `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
- :class:`~transformers.TFDebertaModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or
+ [`TFDebertaModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- relative_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ relative_attention (`bool`, *optional*, defaults to `False`):
Whether use relative position encoding.
- max_relative_positions (:obj:`int`, `optional`, defaults to 1):
- The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
- value as :obj:`max_position_embeddings`.
- pad_token_id (:obj:`int`, `optional`, defaults to 0):
+ max_relative_positions (`int`, *optional*, defaults to 1):
+ The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
+ value as `max_position_embeddings`.
+ pad_token_id (`int`, *optional*, defaults to 0):
The value used to pad input_ids.
- position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ position_biased_input (`bool`, *optional*, defaults to `True`):
Whether add absolute position embedding to content embedding.
- pos_att_type (:obj:`List[str]`, `optional`):
- The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
- :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
- layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+ pos_att_type (`List[str]`, *optional*):
+ The type of relative position attention, it can be a combination of `["p2c", "c2p", "p2p"]`, e.g.
+ `["p2c"]`, `["p2c", "c2p"]`, `["p2c", "c2p", "p2p"]`.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
"""
model_type = "deberta"
diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py
index ddd08e5286..97ddff5d5e 100644
--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -64,23 +64,23 @@ class DebertaTokenizer(GPT2Tokenizer):
Constructs a DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -141,13 +141,13 @@ class DebertaTokenizer(GPT2Tokenizer):
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -160,18 +160,18 @@ class DebertaTokenizer(GPT2Tokenizer):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -189,21 +189,21 @@ class DebertaTokenizer(GPT2Tokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py
index 54f82d6b1a..ae8e37abcc 100644
--- a/src/transformers/models/deberta/tokenization_deberta_fast.py
+++ b/src/transformers/models/deberta/tokenization_deberta_fast.py
@@ -63,26 +63,26 @@ PRETRAINED_INIT_CONFIGURATION = {
class DebertaTokenizerFast(GPT2TokenizerFast):
"""
Constructs a "fast" DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece. It is
- backed by HuggingFace's `tokenizers` library.
+ backed by HuggingFace's *tokenizers* library.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -129,11 +129,11 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
@property
def mask_token(self) -> str:
"""
- :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
- comprise the space before the `[MASK]`.
+ comprise the space before the *[MASK]*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
@@ -161,13 +161,13 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -182,21 +182,21 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
index 9870979fb8..1c283f2cfa 100644
--- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
@@ -30,57 +30,57 @@ DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DebertaV2Config(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.DebertaV2Model`. It is used
+ This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used
to instantiate a DeBERTa-v2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
- `microsoft/deberta-v2-xlarge `__ architecture.
+ [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Arguments:
- vocab_size (:obj:`int`, `optional`, defaults to 128100):
+ vocab_size (`int`, *optional*, defaults to 128100):
Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaV2Model`.
- hidden_size (:obj:`int`, `optional`, defaults to 1536):
+ the `input_ids` passed when calling [`DebertaV2Model`].
+ hidden_size (`int`, *optional*, defaults to 1536):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+ num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 24):
+ num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 6144):
+ intermediate_size (`int`, *optional*, defaults to 6144):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
- :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`,
+ `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 0):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
- :class:`~transformers.TFDebertaModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 0):
+ The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or
+ [`TFDebertaModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-7):
The epsilon used by the layer normalization layers.
- relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ relative_attention (`bool`, *optional*, defaults to `True`):
Whether use relative position encoding.
- max_relative_positions (:obj:`int`, `optional`, defaults to -1):
- The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
- value as :obj:`max_position_embeddings`.
- pad_token_id (:obj:`int`, `optional`, defaults to 0):
+ max_relative_positions (`int`, *optional*, defaults to -1):
+ The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
+ value as `max_position_embeddings`.
+ pad_token_id (`int`, *optional*, defaults to 0):
The value used to pad input_ids.
- position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ position_biased_input (`bool`, *optional*, defaults to `False`):
Whether add absolute position embedding to content embedding.
- pos_att_type (:obj:`List[str]`, `optional`):
- The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
- :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
- layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+ pos_att_type (`List[str]`, *optional*):
+ The type of relative position attention, it can be a combination of `["p2c", "c2p", "p2p"]`, e.g.
+ `["p2c"]`, `["p2c", "c2p"]`, `["p2c", "c2p", "p2p"]`.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
"""
model_type = "deberta-v2"
diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
index dfca91fb1a..5c6612af1a 100644
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -52,49 +52,48 @@ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
class DebertaV2Tokenizer(PreTrainedTokenizer):
r"""
- Constructs a DeBERTa-v2 tokenizer. Based on `SentencePiece `__.
+ Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
- bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+ sequence. The token used is the `cls_token`.
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token. When building a sequence using special tokens, this is not the token that is
- used for the end of sequence. The token used is the :obj:`sep_token`.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ used for the end of sequence. The token used is the `sep_token`.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
@@ -183,13 +182,13 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -201,18 +200,18 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -229,21 +228,21 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
@@ -264,25 +263,24 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
class SPMTokenizer:
r"""
- Constructs a tokenizer based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+ Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
Args:
- vocab_file (:obj:`str`):
- `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py
index d394431925..09d979daef 100644
--- a/src/transformers/models/deit/configuration_deit.py
+++ b/src/transformers/models/deit/configuration_deit.py
@@ -28,59 +28,60 @@ DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DeiTConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.DeiTModel`. It is used to
+ This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to
instantiate an DeiT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DeiT
- `facebook/deit-base-distilled-patch16-224 <https://huggingface.co/facebook/deit-base-distilled-patch16-224>`__
+ [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- image_size (:obj:`int`, `optional`, defaults to :obj:`224`):
+ image_size (`int`, *optional*, defaults to `224`):
The size (resolution) of each image.
- patch_size (:obj:`int`, `optional`, defaults to :obj:`16`):
+ patch_size (`int`, *optional*, defaults to `16`):
The size (resolution) of each patch.
- num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
+ num_channels (`int`, *optional*, defaults to `3`):
The number of input channels.
- qkv_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
- Example::
+ Example:
- >>> from transformers import DeiTModel, DeiTConfig
+ ```python
+ >>> from transformers import DeiTModel, DeiTConfig
- >>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
- >>> configuration = DeiTConfig()
+ >>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
+ >>> configuration = DeiTConfig()
- >>> # Initializing a model from the deit-base-distilled-patch16-224 style configuration
- >>> model = DeiTModel(configuration)
+ >>> # Initializing a model from the deit-base-distilled-patch16-224 style configuration
+ >>> model = DeiTModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "deit"
def __init__(
diff --git a/src/transformers/models/deit/feature_extraction_deit.py b/src/transformers/models/deit/feature_extraction_deit.py
index b5d86ebba6..a5ed140303 100644
--- a/src/transformers/models/deit/feature_extraction_deit.py
+++ b/src/transformers/models/deit/feature_extraction_deit.py
@@ -38,31 +38,31 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a DeiT feature extractor.
- This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+ This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to resize the input to a certain :obj:`size`.
- size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 256):
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the input to a certain `size`.
+ size (`int` or `Tuple(int)`, *optional*, defaults to 256):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
- integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
- is set to :obj:`True`.
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
- An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
- :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
- Only has an effect if :obj:`do_resize` is set to :obj:`True`.
- do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+ integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+ is set to `True`.
+ resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+ An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+ `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+ Only has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
the image is padded with 0's and then center cropped.
- crop_size (:obj:`int`, `optional`, defaults to 224):
- Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
- :obj:`True`.
- do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
- image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+ crop_size (`int`, *optional*, defaults to 224):
+ Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+ `True`.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether or not to normalize the input with `image_mean` and `image_std`.
+ image_mean (`List[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
- image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+ image_std (`List[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
@@ -96,27 +96,29 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Main method to prepare for the model one or several image(s).
- .. warning::
+ <Tip warning={true}>
- NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
- PIL images.
+ NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+ PIL images.
+
+ </Tip>
Args:
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
- * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index dbcf11c37a..190ea7c7eb 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -487,22 +487,23 @@ class DeiTModel(DeiTPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import DeiTFeatureExtractor, DeiTModel
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import DeiTFeatureExtractor, DeiTModel
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
- >>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
+ >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
+ >>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -729,25 +730,26 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
- >>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
+ >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
+ >>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = logits.argmax(-1).item()
- >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
- """
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ >>> # model predicts one of the 1000 ImageNet classes
+ >>> predicted_class_idx = logits.argmax(-1).item()
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deit(
diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index 2f6392a3c6..3edc9da80e 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -28,93 +28,92 @@ DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DetrConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.DetrModel`. It is used to
+ This is the configuration class to store the configuration of a [`DetrModel`]. It is used to
instantiate a DETR model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the DETR `facebook/detr-resnet-50
- <https://huggingface.co/facebook/detr-resnet-50>`__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the DETR [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- num_queries (:obj:`int`, `optional`, defaults to 100):
+ num_queries (`int`, *optional*, defaults to 100):
Number of object queries, i.e. detection slots. This is the maximal number of objects
- :class:`~transformers.DetrModel` can detect in a single image. For COCO, we recommend 100 queries.
- d_model (:obj:`int`, `optional`, defaults to 256):
+ [`DetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
+ d_model (`int`, *optional*, defaults to 256):
Dimension of the layers.
- encoder_layers (:obj:`int`, `optional`, defaults to 6):
+ encoder_layers (`int`, *optional*, defaults to 6):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 6):
+ decoder_layers (`int`, *optional*, defaults to 6):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+ encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- init_xavier_std (:obj:`float`, `optional`, defaults to 1):
+ init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper <see https://arxiv.org/abs/1909.11556>`__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper <see https://arxiv.org/abs/1909.11556>`__ for more details.
- auxiliary_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"sine"`):
- Type of position embeddings to be used on top of the image features. One of :obj:`"sine"` or
- :obj:`"learned"`.
- backbone (:obj:`str`, `optional`, defaults to :obj:`"resnet50"`):
+ position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+ Type of position embeddings to be used on top of the image features. One of `"sine"` or
+ `"learned"`.
+ backbone (`str`, *optional*, defaults to `"resnet50"`):
Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a
- list of all available models, see `this page
- <https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model>`__.
- dilation (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
+ dilation (`bool`, *optional*, defaults to `False`):
Whether to replace stride with dilation in the last convolutional block (DC5).
- class_cost (:obj:`float`, `optional`, defaults to 1):
+ class_cost (`float`, *optional*, defaults to 1):
Relative weight of the classification error in the Hungarian matching cost.
- bbox_cost (:obj:`float`, `optional`, defaults to 5):
+ bbox_cost (`float`, *optional*, defaults to 5):
Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
- giou_cost (:obj:`float`, `optional`, defaults to 2):
+ giou_cost (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
- mask_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
+ mask_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the Focal loss in the panoptic segmentation loss.
- dice_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
+ dice_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
- bbox_loss_coefficient (:obj:`float`, `optional`, defaults to 5):
+ bbox_loss_coefficient (`float`, *optional*, defaults to 5):
Relative weight of the L1 bounding box loss in the object detection loss.
- giou_loss_coefficient (:obj:`float`, `optional`, defaults to 2):
+ giou_loss_coefficient (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss in the object detection loss.
- eos_coefficient (:obj:`float`, `optional`, defaults to 0.1):
+ eos_coefficient (`float`, *optional*, defaults to 0.1):
Relative classification weight of the 'no-object' class in the object detection loss.
- Examples::
+ Examples:
- >>> from transformers import DetrModel, DetrConfig
+ ```python
+ >>> from transformers import DetrModel, DetrConfig
- >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
- >>> configuration = DetrConfig()
+ >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
+ >>> configuration = DetrConfig()
- >>> # Initializing a model from the facebook/detr-resnet-50 style configuration
- >>> model = DetrModel(configuration)
+ >>> # Initializing a model from the facebook/detr-resnet-50 style configuration
+ >>> model = DetrModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "detr"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index e208961c03..a2f93ac2a2 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -124,28 +124,28 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a DETR feature extractor.
- This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+ This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- format (:obj:`str`, `optional`, defaults to :obj:`"coco_detection"`):
+ format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
- do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to resize the input to a certain :obj:`size`.
- size (:obj:`int`, `optional`, defaults to 800):
- Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. If size
- is a sequence like :obj:`(width, height)`, output size will be matched to this. If size is an int, smaller
- edge of the image will be matched to this number. i.e, if :obj:`height > width`, then image will be
- rescaled to :obj:`(size * height / width, size)`.
- max_size (:obj:`int`, `optional`, defaults to :obj:`1333`):
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the input to a certain `size`.
+ size (`int`, *optional*, defaults to 800):
+ Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size
+ is a sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller
+ edge of the image will be matched to this number, i.e., if `height > width`, then image will be
+ rescaled to `(size * height / width, size)`.
+ max_size (`int`, *optional*, defaults to `1333`):
The largest size an image dimension can have (otherwise it's capped). Only has an effect if
- :obj:`do_resize` is set to :obj:`True`.
- do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ `do_resize` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
- image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+ image_mean (`List[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
- image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+ image_std (`List[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
ImageNet std.
"""
@@ -416,39 +416,37 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are
real/which are padding.
- .. warning::
+ <Tip warning={true}>
- NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
- PIL images.
+ NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient approach is to pass
+ PIL images.
+
+ </Tip>
Args:
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- annotations (:obj:`Dict`, :obj:`List[Dict]`, `optional`):
+ annotations (`Dict`, `List[Dict]`, *optional*):
The corresponding annotations in COCO format.
- In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
- "coco_detection"`, the annotations for each image should have the following format: {'image_id': int,
+ In case [`DetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int,
'annotations': [annotation]}, with the annotations being a list of COCO object annotations.
- In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
- "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int,
+ In case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int,
'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic
annotations.
- return_segmentation_masks (:obj:`Dict`, :obj:`List[Dict]`, `optional`, defaults to :obj:`False`):
- Whether to also include instance segmentation masks as part of the labels in case :obj:`format =
- "coco_detection"`.
+ return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+ Whether to also include instance segmentation masks as part of the labels in case `format = "coco_detection"`.
- masks_path (:obj:`pathlib.Path`, `optional`):
+ masks_path (`pathlib.Path`, *optional*):
Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
- relevant in case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
- "coco_panoptic"`.
+ relevant in case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`.
- pad_and_return_pixel_mask (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether or not to pad images up to the largest image in a batch and create a pixel mask.
If left to the default, will return a pixel mask that is:
@@ -456,17 +454,17 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
- If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch
- :obj:`torch.Tensor` objects.
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+ If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch
+ `torch.Tensor` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model.
- - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if
- `"pixel_mask"` is in :obj:`self.model_input_names`).
- - **labels** -- Optional labels to be fed to a model (when :obj:`annotations` are provided)
+ - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
+ `"pixel_mask"` is in `self.model_input_names`).
+ - **labels** -- Optional labels to be fed to a model (when `annotations` are provided)
"""
# Input type checking for clearer error
@@ -634,21 +632,21 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None
):
"""
- Pad images up to the largest image in a batch and create a corresponding :obj:`pixel_mask`.
+ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`.
Args:
- pixel_values_list (:obj:`List[torch.Tensor]`):
+ pixel_values_list (`List[torch.Tensor]`):
List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W).
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
- If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch
- :obj:`torch.Tensor` objects.
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+ If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch
+ `torch.Tensor` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model.
- - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if
- `"pixel_mask"` is in :obj:`self.model_input_names`).
+ - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
+ `"pixel_mask"` is in `self.model_input_names`).
"""
@@ -676,19 +674,19 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
# inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
def post_process(self, outputs, target_sizes):
"""
- Converts the output of :class:`~transformers.DetrForObjectDetection` into the format expected by the COCO api.
+ Converts the output of [`DetrForObjectDetection`] into the format expected by the COCO API.
Only supports PyTorch.
Args:
- outputs (:class:`~transformers.DetrObjectDetectionOutput`):
+ outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
- target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation). For visualization, this should be the image size after data
augment, but before padding.
Returns:
- :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an
image in the batch as predicted by the model.
"""
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
@@ -714,21 +712,21 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
"""
- Converts the output of :class:`~transformers.DetrForSegmentation` into image segmentation predictions. Only
+ Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only
supports PyTorch.
Parameters:
- outputs (:class:`~transformers.DetrSegmentationOutput`):
+ outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
- target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`):
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
- threshold (:obj:`float`, `optional`, defaults to 0.9):
+ threshold (`float`, *optional*, defaults to 0.9):
Threshold to use to filter out queries.
- mask_threshold (:obj:`float`, `optional`, defaults to 0.5):
+ mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
- :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an
image in the batch as predicted by the model.
"""
out_logits, raw_masks = outputs.logits, outputs.pred_masks
@@ -757,26 +755,26 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
# inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
"""
- Converts the output of :class:`~transformers.DetrForSegmentation` into actual instance segmentation
+ Converts the output of [`DetrForSegmentation`] into actual instance segmentation
predictions. Only supports PyTorch.
Args:
- results (:obj:`List[Dict]`):
- Results list obtained by :meth:`~transformers.DetrFeatureExtractor.post_process`, to which "masks"
+ results (`List[Dict]`):
+ Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks"
results will be added.
- outputs (:class:`~transformers.DetrSegmentationOutput`):
+ outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
- orig_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`):
+ orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation).
- max_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`):
+ max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation).
- threshold (:obj:`float`, `optional`, defaults to 0.5):
+ threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
- :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks
+ `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks
for an image in the batch as predicted by the model.
"""
@@ -801,26 +799,26 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
# inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
"""
- Converts the output of :class:`~transformers.DetrForSegmentation` into actual panoptic predictions. Only
+ Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only
supports PyTorch.
Parameters:
- outputs (:class:`~transformers.DetrSegmentationOutput`):
+ outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
- processed_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`):
+ processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
augmentation but before batching.
- target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`, `optional`):
+ target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to
- None, it will default to the :obj:`processed_sizes`.
- is_thing_map (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
+ None, it will default to the `processed_sizes`.
+ is_thing_map (`Dict[int, bool]`, *optional*):
Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
- If not set, defaults to the :obj:`is_thing_map` of COCO panoptic.
- threshold (:obj:`float`, `optional`, defaults to 0.85):
+ If not set, defaults to the `is_thing_map` of COCO panoptic.
+ threshold (`float`, *optional*, defaults to 0.85):
Threshold to use to filter out queries.
Returns:
- :obj:`List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values
+ `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values
for an image in the batch as predicted by the model.
"""
if target_sizes is None:
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 0895290aa7..7d1140577a 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -1205,21 +1205,22 @@ class DetrModel(DetrPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import DetrFeatureExtractor, DetrModel
- >>> from PIL import Image
- >>> import requests
+ ```python
+ >>> from transformers import DetrFeatureExtractor, DetrModel
+ >>> from PIL import Image
+ >>> import requests
- >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
- >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+ >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
- >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50')
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
+ >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50')
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py
index 733714e721..09ffe1619c 100644
--- a/src/transformers/models/distilbert/configuration_distilbert.py
+++ b/src/transformers/models/distilbert/configuration_distilbert.py
@@ -36,62 +36,62 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel` or a
- :class:`~transformers.TFDistilBertModel`. It is used to instantiate a DistilBERT model according to the specified
+ This is the configuration class to store the configuration of a [`DistilBertModel`] or a
+ [`TFDistilBertModel`]. It is used to instantiate a DistilBERT model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the DistilBERT `distilbert-base-uncased
- `__ architecture.
+ configuration to that of the DistilBERT [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.DistilBertModel` or
- :class:`~transformers.TFDistilBertModel`.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ the `input_ids` passed when calling [`DistilBertModel`] or
+ [`TFDistilBertModel`].
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- sinusoidal_pos_embds (:obj:`boolean`, `optional`, defaults to :obj:`False`):
+ sinusoidal_pos_embds (`bool`, *optional*, defaults to `False`):
Whether to use sinusoidal positional embeddings.
- n_layers (:obj:`int`, `optional`, defaults to 6):
+ n_layers (`int`, *optional*, defaults to 6):
Number of hidden layers in the Transformer encoder.
- n_heads (:obj:`int`, `optional`, defaults to 12):
+ n_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- dim (:obj:`int`, `optional`, defaults to 768):
+ dim (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- hidden_dim (:obj:`int`, `optional`, defaults to 3072):
+ hidden_dim (`int`, *optional*, defaults to 3072):
The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ qa_dropout (`float`, *optional*, defaults to 0.1):
The dropout probabilities used in the question answering model
- :class:`~transformers.DistilBertForQuestionAnswering`.
- seq_classif_dropout (:obj:`float`, `optional`, defaults to 0.2):
+ [`DistilBertForQuestionAnswering`].
+ seq_classif_dropout (`float`, *optional*, defaults to 0.2):
The dropout probabilities used in the sequence classification and the multiple choice model
- :class:`~transformers.DistilBertForSequenceClassification`.
+ [`DistilBertForSequenceClassification`].
- Examples::
+ Examples:
- >>> from transformers import DistilBertModel, DistilBertConfig
+ ```python
+ >>> from transformers import DistilBertModel, DistilBertConfig
- >>> # Initializing a DistilBERT configuration
- >>> configuration = DistilBertConfig()
+ >>> # Initializing a DistilBERT configuration
+ >>> configuration = DistilBertConfig()
- >>> # Initializing a model from the configuration
- >>> model = DistilBertModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = DistilBertModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "distilbert"
attribute_map = {
"hidden_size": "dim",
diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py
index 50dc80bdf4..a04c1059c7 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert.py
@@ -57,10 +57,10 @@ class DistilBertTokenizer(BertTokenizer):
r"""
Construct a DistilBERT tokenizer.
- :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+ [`DistilBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
index 4007d4e871..3b052f5cef 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -64,12 +64,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class DistilBertTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`DistilBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py
index a9b5f96556..dd0a9dfddc 100644
--- a/src/transformers/models/dpr/configuration_dpr.py
+++ b/src/transformers/models/dpr/configuration_dpr.py
@@ -32,51 +32,49 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DPRConfig(PretrainedConfig):
r"""
- :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a `DPRModel`.
+ [`DPRConfig`] is the configuration class to store the configuration of a *DPRModel*.
- This is the configuration class to store the configuration of a :class:`~transformers.DPRContextEncoder`,
- :class:`~transformers.DPRQuestionEncoder`, or a :class:`~transformers.DPRReader`. It is used to instantiate the
+ This is the configuration class to store the configuration of a [`DPRContextEncoder`],
+ [`DPRQuestionEncoder`], or a [`DPRReader`]. It is used to instantiate the
components of the DPR model.
- This class is a subclass of :class:`~transformers.BertConfig`. Please check the superclass for the documentation of
+ This class is a subclass of [`BertConfig`]. Please check the superclass for the documentation of
all kwargs.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
- Vocabulary size of the DPR model. Defines the different tokens that can be represented by the `inputs_ids`
- passed to the forward method of :class:`~transformers.BertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the DPR model. Defines the different tokens that can be represented by the `input_ids`
+ passed to the forward method of [`BertModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed into [`BertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- `__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- `__.
- projection_dim (:obj:`int`, `optional`, defaults to 0):
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ projection_dim (`int`, *optional*, defaults to 0):
Dimension of the projection for the context and question encoders. If it is set to zero (default), then no
projection is done.
"""
diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py
index c845c31aa9..333106e4fb 100644
--- a/src/transformers/models/dpr/modeling_dpr.py
+++ b/src/transformers/models/dpr/modeling_dpr.py
@@ -64,7 +64,7 @@ class DPRContextEncoderOutput(ModelOutput):
Class for outputs of [`DPRQuestionEncoder`].
Args:
- pooler_output: (:obj:`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
@@ -91,7 +91,7 @@ class DPRQuestionEncoderOutput(ModelOutput):
Class for outputs of [`DPRQuestionEncoder`].
Args:
- pooler_output: (:obj:`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
@@ -118,11 +118,11 @@ class DPRReaderOutput(ModelOutput):
Class for outputs of [`DPRQuestionEncoder`].
Args:
- start_logits: (:obj:`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
+ start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
Logits of the start index of the span for each passage.
- end_logits: (:obj:`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
+ end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
Logits of the end index of the span for each passage.
- relevance_logits: (``torch.FloatTensor``` of shape `(n_passages, )`):
+ relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
question, compared to all the other passages.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
@@ -350,17 +350,17 @@ DPR_ENCODERS_INPUTS_DOCSTRING = r"""
(a) For sequence pairs (for a pair title+text for example):
- ```
- tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
- token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
- ```
+ ```
+ tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ ```
(b) For single sequences (for a question for example):
- ```
- tokens: [CLS] the dog is hairy . [SEP]
- token_type_ids: 0 0 0 0 0 0 0
- ```
+ ```
+ tokens: [CLS] the dog is hairy . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0
+ ```
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
@@ -463,14 +463,15 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
r"""
Return:
- Examples::
+ Examples:
- >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
- >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
- >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
- >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
- >>> embeddings = model(input_ids).pooler_output
- """
+ ```python
+ >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
+ >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+ >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+ >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+ >>> embeddings = model(input_ids).pooler_output
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -542,13 +543,15 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
r"""
Return:
- Examples::
+ Examples:
- >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
- >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
- >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
- >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
- >>> embeddings = model(input_ids).pooler_output
+ ```python
+ >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
+ >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+ >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+ >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+ >>> embeddings = model(input_ids).pooler_output
+ ```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -619,22 +622,23 @@ class DPRReader(DPRPretrainedReader):
r"""
Return:
- Examples::
-
- >>> from transformers import DPRReader, DPRReaderTokenizer
- >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> encoded_inputs = tokenizer(
- ... questions=["What is love ?"],
- ... titles=["Haddaway"],
- ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
- ... return_tensors='pt'
- ... )
- >>> outputs = model(**encoded_inputs)
- >>> start_logits = outputs.start_logits
- >>> end_logits = outputs.end_logits
- >>> relevance_logits = outputs.relevance_logits
+ Examples:
+ ```python
+ >>> from transformers import DPRReader, DPRReaderTokenizer
+ >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> encoded_inputs = tokenizer(
+ ... questions=["What is love ?"],
+ ... titles=["Haddaway"],
+ ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+ ... return_tensors='pt'
+ ... )
+ >>> outputs = model(**encoded_inputs)
+ >>> start_logits = outputs.start_logits
+ >>> end_logits = outputs.end_logits
+ >>> relevance_logits = outputs.relevance_logits
+ ```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py
index 4ec0e7b5fb..75e925cb2a 100644
--- a/src/transformers/models/dpr/modeling_tf_dpr.py
+++ b/src/transformers/models/dpr/modeling_tf_dpr.py
@@ -61,7 +61,7 @@ class TFDPRContextEncoderOutput(ModelOutput):
Class for outputs of [`TFDPRContextEncoder`].
Args:
- pooler_output: (:obj:`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+ pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
@@ -88,7 +88,7 @@ class TFDPRQuestionEncoderOutput(ModelOutput):
Class for outputs of [`TFDPRQuestionEncoder`].
Args:
- pooler_output: (:obj:`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+ pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
@@ -115,11 +115,11 @@ class TFDPRReaderOutput(ModelOutput):
Class for outputs of [`TFDPRReaderEncoder`].
Args:
- start_logits: (:obj:`tf.Tensor` of shape `(n_passages, sequence_length)`):
+ start_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`):
Logits of the start index of the span for each passage.
- end_logits: (:obj:`tf.Tensor` of shape `(n_passages, sequence_length)`):
+ end_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`):
Logits of the end index of the span for each passage.
- relevance_logits: (``tf.Tensor``` of shape `(n_passages, )`):
+ relevance_logits (`tf.Tensor` of shape `(n_passages, )`):
Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
question, compared to all the other passages.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
@@ -485,17 +485,17 @@ TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
(a) For sequence pairs (for a pair title+text for example):
- ```
- tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
- token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
- ```
+ ```
+ tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ ```
(b) For single sequences (for a question for example):
- ```
- tokens: [CLS] the dog is hairy . [SEP]
- token_type_ids: 0 0 0 0 0 0 0
- ```
+ ```
+ tokens: [CLS] the dog is hairy . [SEP]
+ token_type_ids: 0 0 0 0 0 0 0
+ ```
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
@@ -610,13 +610,15 @@ class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
r"""
Return:
- Examples::
+ Examples:
- >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
- >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
- >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', from_pt=True)
- >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
- >>> embeddings = model(input_ids).pooler_output
+ ```python
+ >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
+ >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+ >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', from_pt=True)
+ >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
+ >>> embeddings = model(input_ids).pooler_output
+ ```
"""
inputs = input_processing(
func=self.call,
@@ -708,13 +710,15 @@ class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
r"""
Return:
- Examples::
+ Examples:
- >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
- >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
- >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', from_pt=True)
- >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
- >>> embeddings = model(input_ids).pooler_output
+ ```python
+ >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
+ >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+ >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', from_pt=True)
+ >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
+ >>> embeddings = model(input_ids).pooler_output
+ ```
"""
inputs = input_processing(
func=self.call,
@@ -804,22 +808,23 @@ class TFDPRReader(TFDPRPretrainedReader):
r"""
Return:
- Examples::
-
- >>> from transformers import TFDPRReader, DPRReaderTokenizer
- >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', from_pt=True)
- >>> encoded_inputs = tokenizer(
- ... questions=["What is love ?"],
- ... titles=["Haddaway"],
- ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
- ... return_tensors='tf'
- ... )
- >>> outputs = model(encoded_inputs)
- >>> start_logits = outputs.start_logits
- >>> end_logits = outputs.end_logits
- >>> relevance_logits = outputs.relevance_logits
+ Examples:
+ ```python
+ >>> from transformers import TFDPRReader, DPRReaderTokenizer
+ >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', from_pt=True)
+ >>> encoded_inputs = tokenizer(
+ ... questions=["What is love ?"],
+ ... titles=["Haddaway"],
+ ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+ ... return_tensors='tf'
+ ... )
+ >>> outputs = model(encoded_inputs)
+ >>> start_logits = outputs.start_logits
+ >>> end_logits = outputs.end_logits
+ >>> relevance_logits = outputs.relevance_logits
+ ```
"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py
index 23bfff9062..46ab974262 100644
--- a/src/transformers/models/dpr/tokenization_dpr.py
+++ b/src/transformers/models/dpr/tokenization_dpr.py
@@ -91,10 +91,10 @@ class DPRContextEncoderTokenizer(BertTokenizer):
r"""
Construct a DPRContextEncoder tokenizer.
- :class:`~transformers.DPRContextEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
+ [`DPRContextEncoderTokenizer`] is identical to [`BertTokenizer`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
@@ -108,10 +108,10 @@ class DPRQuestionEncoderTokenizer(BertTokenizer):
r"""
Constructs a DPRQuestionEncoder tokenizer.
- :class:`~transformers.DPRQuestionEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
+ [`DPRQuestionEncoderTokenizer`] is identical to [`BertTokenizer`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
@@ -130,70 +130,70 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
CUSTOM_DPR_READER_DOCSTRING = r"""
Return a dictionary with the token ids of the input strings and other information to give to
- :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
- sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
- :obj:`(n_passages, sequence_length)` with the format:
+ `.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
+ sequence of IDs (integers), using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size
+ `(n_passages, sequence_length)` with the format:
- ::
-
- [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+ ```
+ [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+ ```
Args:
- questions (:obj:`str` or :obj:`List[str]`):
+ questions (`str` or `List[str]`):
The questions to be encoded. You can specify one question for many passages. In this case, the question
- will be duplicated like :obj:`[questions] * n_passages`. Otherwise you have to specify as many questions as
- in :obj:`titles` or :obj:`texts`.
- titles (:obj:`str` or :obj:`List[str]`):
+ will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as
+ in `titles` or `texts`.
+ titles (`str` or `List[str]`):
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
- texts (:obj:`str` or :obj:`List[str]`):
+ texts (`str` or `List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
Activates and controls truncation. Accepts the following values:
- * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
- :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+ `max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair if a
pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only truncate
the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
lengths greater than the model maximum admissible input size).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters.
- If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+ If left unset or set to `None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
- return_attention_mask (:obj:`bool`, `optional`):
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ return_attention_mask (`bool`, *optional*):
Whether or not to return the attention mask. If not set, will return the attention mask according to the
- specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+ specific tokenizer's default, defined by the `return_outputs` attribute.
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
Returns:
- :obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys:
+ `Dict[str, List[List[int]]]`: A dictionary with the following keys:
- - ``input_ids``: List of token ids to be fed to a model.
- - ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
+ - `input_ids`: List of token ids to be fed to a model.
+ - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
"""
@@ -268,33 +268,31 @@ class CustomDPRReaderTokenizerMixin:
"""
Get the span predictions for the extractive Q&A model.
- Returns: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. Each
- `DPRReaderOutput` is a `Tuple` with:
+ Returns: *List* of *DPRReaderOutput* sorted by descending *(relevance_score, span_score)*. Each
+ *DPRReaderOutput* is a *Tuple* with:
- - **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to
+ - **span_score**: `float` that corresponds to the score given by the reader for this span compared to
other spans in the same passage. It corresponds to the sum of the start and end logits of the span.
- - **relevance_score**: ``float`` that corresponds to the score of the each passage to answer the question,
+ - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
- - **doc_id**: ``int``` the id of the passage.
- - **start_index**: ``int`` the start index of the span (inclusive).
- - **end_index**: ``int`` the end index of the span (inclusive).
+ - **doc_id**: `int` the id of the passage. - **start_index**: `int` the start index of the span (inclusive). - **end_index**: `int` the end index of the span (inclusive).
- Examples::
+ Examples:
- >>> from transformers import DPRReader, DPRReaderTokenizer
- >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> encoded_inputs = tokenizer(
- ... questions=["What is love ?"],
- ... titles=["Haddaway"],
- ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
- ... return_tensors='pt'
- ... )
- >>> outputs = model(**encoded_inputs)
- >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
- >>> print(predicted_spans[0].text) # best span
-
- """
+ ```python
+ >>> from transformers import DPRReader, DPRReaderTokenizer
+ >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> encoded_inputs = tokenizer(
+ ... questions=["What is love ?"],
+ ... titles=["Haddaway"],
+ ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+ ... return_tensors='pt'
+ ... )
+ >>> outputs = model(**encoded_inputs)
+ >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+ >>> print(predicted_spans[0].text) # best span
+ ```"""
input_ids = reader_input["input_ids"]
start_logits, end_logits, relevance_logits = reader_output[:3]
n_passages = len(relevance_logits)
@@ -373,11 +371,11 @@ class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
r"""
Construct a DPRReader tokenizer.
- :class:`~transformers.DPRReaderTokenizer` is almost identical to :class:`~transformers.BertTokenizer` and runs
+ [`DPRReaderTokenizer`] is almost identical to [`BertTokenizer`] and runs
end-to-end tokenization: punctuation splitting and wordpiece. The difference is that is has three inputs strings:
- question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model.
+ question, titles and texts that are combined to be fed to the [`DPRReader`] model.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py
index 1f5a37be24..d4d5f41418 100644
--- a/src/transformers/models/dpr/tokenization_dpr_fast.py
+++ b/src/transformers/models/dpr/tokenization_dpr_fast.py
@@ -90,12 +90,12 @@ READER_PRETRAINED_INIT_CONFIGURATION = {
class DPRContextEncoderTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and
+ [`DPRContextEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and
runs end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
@@ -108,12 +108,12 @@ class DPRContextEncoderTokenizerFast(BertTokenizerFast):
class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
r"""
- Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's `tokenizers` library).
+ Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and
+ [`DPRQuestionEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and
runs end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
@@ -133,68 +133,68 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
CUSTOM_DPR_READER_DOCSTRING = r"""
Return a dictionary with the token ids of the input strings and other information to give to
- :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
- sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
- :obj:`(n_passages, sequence_length)` with the format:
+ `.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
+ sequence of IDs (integers), using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size
+ `(n_passages, sequence_length)` with the format:
[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
Args:
- questions (:obj:`str` or :obj:`List[str]`):
+ questions (`str` or `List[str]`):
The questions to be encoded. You can specify one question for many passages. In this case, the question
- will be duplicated like :obj:`[questions] * n_passages`. Otherwise you have to specify as many questions as
- in :obj:`titles` or :obj:`texts`.
- titles (:obj:`str` or :obj:`List[str]`):
+ will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as
+ in `titles` or `texts`.
+ titles (`str` or `List[str]`):
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
- texts (:obj:`str` or :obj:`List[str]`):
+ texts (`str` or `List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
Activates and controls truncation. Accepts the following values:
- * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
- :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+ `max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair if a
pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only truncate
the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
lengths greater than the model maximum admissible input size).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters.
- If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+ If left unset or set to `None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
- return_attention_mask (:obj:`bool`, `optional`):
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ return_attention_mask (`bool`, *optional*):
Whether or not to return the attention mask. If not set, will return the attention mask according to the
- specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+ specific tokenizer's default, defined by the `return_outputs` attribute.
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
Return:
- :obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys:
+ `Dict[str, List[List[int]]]`: A dictionary with the following keys:
- - ``input_ids``: List of token ids to be fed to a model.
- - ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
+ - `input_ids`: List of token ids to be fed to a model.
+ - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
"""
@@ -269,33 +269,31 @@ class CustomDPRReaderTokenizerMixin:
"""
Get the span predictions for the extractive Q&A model.
- Returns: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. Each
- `DPRReaderOutput` is a `Tuple` with:
+ Returns: *List* of *DPRReaderOutput* sorted by descending *(relevance_score, span_score)*. Each
+ *DPRReaderOutput* is a *Tuple* with:
- - **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to
+ - **span_score**: `float` that corresponds to the score given by the reader for this span compared to
other spans in the same passage. It corresponds to the sum of the start and end logits of the span.
- - **relevance_score**: ``float`` that corresponds to the score of the each passage to answer the question,
+ - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
- - **doc_id**: ``int``` the id of the passage.
- - ***start_index**: ``int`` the start index of the span (inclusive).
- - **end_index**: ``int`` the end index of the span (inclusive).
+ - **doc_id**: `int` the id of the passage. - **start_index**: `int` the start index of the span (inclusive). - **end_index**: `int` the end index of the span (inclusive).
- Examples::
+ Examples:
- >>> from transformers import DPRReader, DPRReaderTokenizer
- >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
- >>> encoded_inputs = tokenizer(
- ... questions=["What is love ?"],
- ... titles=["Haddaway"],
- ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
- ... return_tensors='pt'
- ... )
- >>> outputs = model(**encoded_inputs)
- >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
- >>> print(predicted_spans[0].text) # best span
-
- """
+ ```python
+ >>> from transformers import DPRReader, DPRReaderTokenizer
+ >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+ >>> encoded_inputs = tokenizer(
+ ... questions=["What is love ?"],
+ ... titles=["Haddaway"],
+ ... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+ ... return_tensors='pt'
+ ... )
+ >>> outputs = model(**encoded_inputs)
+ >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+ >>> print(predicted_spans[0].text) # best span
+ ```"""
input_ids = reader_input["input_ids"]
start_logits, end_logits, relevance_logits = reader_output[:3]
n_passages = len(relevance_logits)
@@ -372,13 +370,13 @@ class CustomDPRReaderTokenizerMixin:
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
r"""
- Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's `tokenizers` library).
+ Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.DPRReaderTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and
+ [`DPRReaderTokenizerFast`] is almost identical to [`BertTokenizerFast`] and
runs end-to-end tokenization: punctuation splitting and wordpiece. The difference is that is has three inputs
- strings: question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model.
+ strings: question, titles and texts that are combined to be fed to the [`DPRReader`] model.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py
index b0fb6ea73c..963460318e 100644
--- a/src/transformers/models/electra/configuration_electra.py
+++ b/src/transformers/models/electra/configuration_electra.py
@@ -33,96 +33,94 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ElectraConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel` or a
- :class:`~transformers.TFElectraModel`. It is used to instantiate a ELECTRA model according to the specified
+ This is the configuration class to store the configuration of an [`ElectraModel`] or a
+ [`TFElectraModel`]. It is used to instantiate an ELECTRA model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the ELECTRA `google/electra-small-discriminator
- `__ architecture.
+ configuration to that of the ELECTRA [google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.ElectraModel` or
- :class:`~transformers.TFElectraModel`.
- embedding_size (:obj:`int`, `optional`, defaults to 128):
+ `inputs_ids` passed when calling [`ElectraModel`] or
+ [`TFElectraModel`].
+ embedding_size (`int`, *optional*, defaults to 128):
Dimensionality of the encoder layers and the pooler layer.
- hidden_size (:obj:`int`, `optional`, defaults to 256):
+ hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 4):
+ num_attention_heads (`int`, *optional*, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 1024):
+ intermediate_size (`int`, *optional*, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ElectraModel` or
- :class:`~transformers.TFElectraModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`ElectraModel`] or
+ [`TFElectraModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- summary_type (:obj:`str`, `optional`, defaults to :obj:`"first"`):
+ summary_type (`str`, *optional*, defaults to `"first"`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Has to be one of the following options:
- - :obj:`"last"`: Take the last token hidden state (like XLNet).
- - :obj:`"first"`: Take the first token hidden state (like BERT).
- - :obj:`"mean"`: Take the mean of all tokens hidden states.
- - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- - :obj:`"attn"`: Not implemented now, use multi-head attention.
- summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ - `"last"`: Take the last token hidden state (like XLNet).
+ - `"first"`: Take the first token hidden state (like BERT).
+ - `"mean"`: Take the mean of all tokens hidden states.
+ - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+ - `"attn"`: Not implemented now, use multi-head attention.
+ summary_use_proj (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Whether or not to add a projection after the vector extraction.
- summary_activation (:obj:`str`, `optional`):
+ summary_activation (`str`, *optional*):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
- Pass :obj:`"gelu"` for a gelu activation to the output, any other value will result in no activation.
- summary_last_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ Pass `"gelu"` for a gelu activation to the output, any other value will result in no activation.
+ summary_last_dropout (`float`, *optional*, defaults to 0.0):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
The dropout ratio to be used after the projection and activation.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- `__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- `__.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if ``config.is_decoder=True``.
- classifier_dropout (:obj:`float`, `optional`):
+ relevant if `config.is_decoder=True`.
+ classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
- Examples::
+ Examples:
- >>> from transformers import ElectraModel, ElectraConfig
+ ```python
+ >>> from transformers import ElectraModel, ElectraConfig
- >>> # Initializing a ELECTRA electra-base-uncased style configuration
- >>> configuration = ElectraConfig()
+ >>> # Initializing an ELECTRA electra-base-uncased style configuration
+ >>> configuration = ElectraConfig()
- >>> # Initializing a model from the electra-base-uncased style configuration
- >>> model = ElectraModel(configuration)
+ >>> # Initializing a model from the electra-base-uncased style configuration
+ >>> model = ElectraModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "electra"
def __init__(
diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py
index 020a18eba6..afa028fb45 100644
--- a/src/transformers/models/electra/modeling_flax_electra.py
+++ b/src/transformers/models/electra/modeling_flax_electra.py
@@ -814,17 +814,19 @@ class FlaxElectraForPreTraining(FlaxElectraPreTrainedModel):
FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import ElectraTokenizer, FlaxElectraForPreTraining
+ ```python
+ >>> from transformers import ElectraTokenizer, FlaxElectraForPreTraining
- >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
- >>> model = FlaxElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+ >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+ >>> model = FlaxElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
- >>> outputs = model(**inputs)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+ >>> outputs = model(**inputs)
- >>> prediction_logits = outputs.logits
+ >>> prediction_logits = outputs.logits
+ ```
"""
overwrite_call_docstring(
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index 10f8ac6cfc..2dedc146f6 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -1082,17 +1082,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import ElectraTokenizer, TFElectraForPreTraining
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import ElectraTokenizer, TFElectraForPreTraining
- >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
- >>> model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
- >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- >>> outputs = model(input_ids)
- >>> scores = outputs[0]
- """
+ >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+ >>> model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+ >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
+ >>> outputs = model(input_ids)
+ >>> scores = outputs[0]
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py
index 89c6c922e9..8f087263d8 100644
--- a/src/transformers/models/electra/tokenization_electra.py
+++ b/src/transformers/models/electra/tokenization_electra.py
@@ -53,10 +53,10 @@ class ElectraTokenizer(BertTokenizer):
r"""
Construct an ELECTRA tokenizer.
- :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+ [`ElectraTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py
index 67259d83ea..41c7bd5536 100644
--- a/src/transformers/models/electra/tokenization_electra_fast.py
+++ b/src/transformers/models/electra/tokenization_electra_fast.py
@@ -60,12 +60,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class ElectraTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`ElectraTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index b12e32a2c3..4fc7f6b563 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -25,49 +25,50 @@ logger = logging.get_logger(__name__)
class EncoderDecoderConfig(PretrainedConfig):
r"""
- :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a
- :class:`~transformers.EncoderDecoderModel`. It is used to instantiate an Encoder Decoder model according to the
+ [`EncoderDecoderConfig`] is the configuration class to store the configuration of a
+ [`EncoderDecoderModel`]. It is used to instantiate an Encoder Decoder model according to the
specified arguments, defining the encoder and decoder configs.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- kwargs (`optional`):
+ kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- - **encoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+ - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
object that defines the encoder config.
- - **decoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+ - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
object that defines the decoder config.
- Examples::
+ Examples:
- >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+ ```python
+ >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
- >>> # Initializing a BERT bert-base-uncased style configuration
- >>> config_encoder = BertConfig()
- >>> config_decoder = BertConfig()
+ >>> # Initializing a BERT bert-base-uncased style configuration
+ >>> config_encoder = BertConfig()
+ >>> config_decoder = BertConfig()
- >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+ >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
- >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
- >>> model = EncoderDecoderModel(config=config)
+ >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
+ >>> model = EncoderDecoderModel(config=config)
- >>> # Accessing the model configuration
- >>> config_encoder = model.config.encoder
- >>> config_decoder = model.config.decoder
- >>> # set decoder config to causal lm
- >>> config_decoder.is_decoder = True
- >>> config_decoder.add_cross_attention = True
+ >>> # Accessing the model configuration
+ >>> config_encoder = model.config.encoder
+ >>> config_decoder = model.config.decoder
+ >>> # set decoder config to causal lm
+ >>> config_decoder.is_decoder = True
+ >>> config_decoder.add_cross_attention = True
- >>> # Saving the model, including its configuration
- >>> model.save_pretrained('my-model')
+ >>> # Saving the model, including its configuration
+ >>> model.save_pretrained('my-model')
- >>> # loading model and config from pretrained folder
- >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
- >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
- """
+ >>> # loading model and config from pretrained folder
+ >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
+ >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+ ```"""
model_type = "encoder-decoder"
is_composition = True
@@ -92,11 +93,11 @@ class EncoderDecoderConfig(PretrainedConfig):
cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
r"""
- Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
+ Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
configuration and decoder model configuration.
Returns:
- :class:`EncoderDecoderConfig`: An instance of a configuration object
+ [`EncoderDecoderConfig`]: An instance of a configuration object
"""
logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
decoder_config.is_decoder = True
@@ -106,10 +107,10 @@ class EncoderDecoderConfig(PretrainedConfig):
def to_dict(self):
"""
- Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+ Serializes this instance to a Python dictionary. Overrides the default `to_dict()` from [`PretrainedConfig`].
Returns:
- :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["encoder"] = self.encoder.to_dict()
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 27d69e9ef9..96205ea334 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -444,32 +444,32 @@ class EncoderDecoderModel(PreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import EncoderDecoderModel, BertTokenizer
- >>> import torch
+ ```python
+ >>> from transformers import EncoderDecoderModel, BertTokenizer
+ >>> import torch
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints
- >>> # training
- >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
- >>> model.config.pad_token_id = tokenizer.pad_token_id
- >>> model.config.vocab_size = model.config.decoder.vocab_size
+ >>> # training
+ >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
+ >>> model.config.pad_token_id = tokenizer.pad_token_id
+ >>> model.config.vocab_size = model.config.decoder.vocab_size
- >>> input_ids = tokenizer("This is a really long text", return_tensors="pt").input_ids
- >>> labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
- >>> outputs = model(input_ids=input_ids, labels=input_ids)
- >>> loss, logits = outputs.loss, outputs.logits
+ >>> input_ids = tokenizer("This is a really long text", return_tensors="pt").input_ids
+ >>> labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
+ >>> outputs = model(input_ids=input_ids, labels=labels)
+ >>> loss, logits = outputs.loss, outputs.logits
- >>> # save and load from pretrained
- >>> model.save_pretrained("bert2bert")
- >>> model = EncoderDecoderModel.from_pretrained("bert2bert")
+ >>> # save and load from pretrained
+ >>> model.save_pretrained("bert2bert")
+ >>> model = EncoderDecoderModel.from_pretrained("bert2bert")
- >>> # generation
- >>> generated = model.generate(input_ids)
-
- """
+ >>> # generation
+ >>> generated = model.generate(input_ids)
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 3cfb2eb334..186b2ee527 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -428,20 +428,20 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
- >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
- >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+ >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> input_ids = tokenizer.encode(text, return_tensors='np')
- >>> encoder_outputs = model.encode(input_ids)
-
- """
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> input_ids = tokenizer.encode(text, return_tensors='np')
+ >>> encoder_outputs = model.encode(input_ids)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -505,27 +505,27 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
- >>> import jax.numpy as jnp
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+ >>> import jax.numpy as jnp
- >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
- >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+ >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors='np')
- >>> encoder_outputs = model.encode(input_ids)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors='np')
+ >>> encoder_outputs = model.encode(input_ids)
- >>> decoder_start_token_id = model.config.decoder.bos_token_id
- >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder.bos_token_id
+ >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
-
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -631,32 +631,33 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer, GPT2Tokenizer
+ ```python
+ >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer, GPT2Tokenizer
- >>> # load a fine-tuned bert2gpt2 model
- >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
- >>> # load input & output tokenizer
- >>> tokenizer_input = BertTokenizer.from_pretrained('bert-base-cased')
- >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
+ >>> # load a fine-tuned bert2gpt2 model
+ >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
+ >>> # load input & output tokenizer
+ >>> tokenizer_input = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
- >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
- ... singing a racist chant. SAE's national chapter suspended the students,
- ... but University of Oklahoma President David Boren took it a step further,
- ... saying the university's affiliation with the fraternity is permanently done.'''
+ >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
+ ... singing a racist chant. SAE's national chapter suspended the students,
+ ... but University of Oklahoma President David Boren took it a step further,
+ ... saying the university's affiliation with the fraternity is permanently done.'''
- >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors='np').input_ids
+ >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors='np').input_ids
- >>> # use GPT2's eos_token as the pad as well as eos token
- >>> model.config.eos_token_id = model.config.decoder.eos_token_id
- >>> model.config.pad_token_id = model.config.eos_token_id
+ >>> # use GPT2's eos_token as the pad as well as eos token
+ >>> model.config.eos_token_id = model.config.decoder.eos_token_id
+ >>> model.config.pad_token_id = model.config.eos_token_id
- >>> sequences = model.generate(input_ids, num_beams=4, max_length=12).sequences
+ >>> sequences = model.generate(input_ids, num_beams=4, max_length=12).sequences
- >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
- >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
- """
+ >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
+ >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 9dc68878f3..4735d94a3f 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -263,26 +263,28 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""
- Initializing `TFEncoderDecoderModel` from a pytorch checkpoint is not supported currently.
+ Initializing [`TFEncoderDecoderModel`] from a PyTorch checkpoint is not supported currently.
- If there are only pytorch checkpoints for a particular encoder-decoder model, a workaround is::
+ If there are only PyTorch checkpoints for a particular encoder-decoder model, a workaround is:
- >>> # a workaround to load from pytorch checkpoint
- >>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
- >>> _model.encoder.save_pretrained("./encoder")
- >>> _model.decoder.save_pretrained("./decoder")
- >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
- ... "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
- ... )
- >>> # This is only for copying some specific attributes of this particular model.
- >>> model.config = _model.config
+ ```python
+ >>> # a workaround to load from pytorch checkpoint
+ >>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+ >>> _model.encoder.save_pretrained("./encoder")
+ >>> _model.decoder.save_pretrained("./decoder")
+ >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
+ ... "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
+ ... )
+ >>> # This is only for copying some specific attributes of this particular model.
+ >>> model.config = _model.config
+ ```
- Example::
+ Example:
- >>> from transformers import TFEncoderDecoderModel
- >>> model = TFEncoderDecoderModel.from_pretrained("ydshieh/bert2bert-cnn_dailymail-fp16")
-
- """
+ ```python
+ >>> from transformers import TFEncoderDecoderModel
+ >>> model = TFEncoderDecoderModel.from_pretrained("ydshieh/bert2bert-cnn_dailymail-fp16")
+ ```"""
from_pt = kwargs.pop("from_pt", False)
if from_pt:
@@ -481,31 +483,31 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import TFEncoderDecoderModel, BertTokenizer
+ ```python
+ >>> from transformers import TFEncoderDecoderModel, BertTokenizer
- >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
- >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+ >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+ >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
- >>> # forward
- >>> input_ids = tokenizer.encode("Hello, my dog is cute", add_special_tokens=True, return_tensors='tf') # Batch size 1
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+ >>> # forward
+ >>> input_ids = tokenizer.encode("Hello, my dog is cute", add_special_tokens=True, return_tensors='tf') # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
- >>> # training
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
- >>> loss, logits = outputs.loss, outputs.logits
+ >>> # training
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+ >>> loss, logits = outputs.loss, outputs.logits
- >>> # save and load from pretrained
- >>> model.save_pretrained("bert2gpt2")
- >>> model = TFEncoderDecoderModel.from_pretrained("bert2gpt2")
+ >>> # save and load from pretrained
+ >>> model.save_pretrained("bert2gpt2")
+ >>> model = TFEncoderDecoderModel.from_pretrained("bert2gpt2")
- >>> # generation
- >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id)
-
- """
+ >>> # generation
+ >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id)
+ ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py
index a372ff47ce..14509347f4 100644
--- a/src/transformers/models/flaubert/configuration_flaubert.py
+++ b/src/transformers/models/flaubert/configuration_flaubert.py
@@ -30,105 +30,105 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FlaubertConfig(XLMConfig):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.FlaubertModel` or a
- :class:`~transformers.TFFlaubertModel`. It is used to instantiate a FlauBERT model according to the specified
+ This is the configuration class to store the configuration of a [`FlaubertModel`] or a
+ [`TFFlaubertModel`]. It is used to instantiate a FlauBERT model according to the specified
arguments, defining the model architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ pre_norm (`bool`, *optional*, defaults to `False`):
Whether to apply the layer normalization before or after the feed forward layer following the attention in
each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
- layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+ layerdrop (`float`, *optional*, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with
Structured Dropout. ICLR 2020)
- vocab_size (:obj:`int`, `optional`, defaults to 30145):
+ vocab_size (`int`, *optional*, defaults to 30145):
Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.FlaubertModel` or
- :class:`~transformers.TFFlaubertModel`.
- emb_dim (:obj:`int`, `optional`, defaults to 2048):
+ the `inputs_ids` passed when calling [`FlaubertModel`] or
+ [`TFFlaubertModel`].
+ emb_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
- n_layer (:obj:`int`, `optional`, defaults to 12):
+ n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- n_head (:obj:`int`, `optional`, defaults to 16):
+ n_head (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention mechanism
- gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to use a `gelu` activation instead of `relu`.
- sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ gelu_activation (`bool`, *optional*, defaults to `True`):
+ Whether or not to use a *gelu* activation instead of *relu*.
+ sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
- causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ causal (`bool`, *optional*, defaults to `False`):
Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
order to only attend to the left-side context instead if a bidirectional context.
- asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ asm (`bool`, *optional*, defaults to `False`):
Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
- n_langs (:obj:`int`, `optional`, defaults to 1):
+ n_langs (`int`, *optional*, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
- use_lang_emb (:obj:`bool`, `optional`, defaults to :obj:`True`)
- Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual
- models page `__ for
+ use_lang_emb (`bool`, *optional*, defaults to `True`):
+ Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
+ models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for
information on how to use them.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- embed_init_std (:obj:`float`, `optional`, defaults to 2048^-0.5):
+ embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
- init_std (:obj:`int`, `optional`, defaults to 50257):
+ init_std (`int`, *optional*, defaults to 50257):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
embedding matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- bos_index (:obj:`int`, `optional`, defaults to 0):
+ bos_index (`int`, *optional*, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
- eos_index (:obj:`int`, `optional`, defaults to 1):
+ eos_index (`int`, *optional*, defaults to 1):
The index of the end of sentence token in the vocabulary.
- pad_index (:obj:`int`, `optional`, defaults to 2):
+ pad_index (`int`, *optional*, defaults to 2):
The index of the padding token in the vocabulary.
- unk_index (:obj:`int`, `optional`, defaults to 3):
+ unk_index (`int`, *optional*, defaults to 3):
The index of the unknown token in the vocabulary.
- mask_index (:obj:`int`, `optional`, defaults to 5):
+ mask_index (`int`, *optional*, defaults to 5):
The index of the masking token in the vocabulary.
- is_encoder(:obj:`bool`, `optional`, defaults to :obj:`True`):
+ is_encoder (`bool`, *optional*, defaults to `True`):
Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
- summary_type (:obj:`string`, `optional`, defaults to "first"):
+ summary_type (`string`, *optional*, defaults to "first"):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Has to be one of the following options:
- - :obj:`"last"`: Take the last token hidden state (like XLNet).
- - :obj:`"first"`: Take the first token hidden state (like BERT).
- - :obj:`"mean"`: Take the mean of all tokens hidden states.
- - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- - :obj:`"attn"`: Not implemented now, use multi-head attention.
- summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ - `"last"`: Take the last token hidden state (like XLNet).
+ - `"first"`: Take the first token hidden state (like BERT).
+ - `"mean"`: Take the mean of all tokens hidden states.
+ - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+ - `"attn"`: Not implemented now, use multi-head attention.
+ summary_use_proj (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Whether or not to add a projection after the vector extraction.
- summary_activation (:obj:`str`, `optional`):
+ summary_activation (`str`, *optional*):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
- Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
- summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+ summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
Used in the sequence classification and multiple choice models.
- Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
- summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+ summary_first_dropout (`float`, *optional*, defaults to 0.1):
Used in the sequence classification and multiple choice models.
The dropout ratio to be used after the projection and activation.
- start_n_top (:obj:`int`, `optional`, defaults to 5):
+ start_n_top (`int`, *optional*, defaults to 5):
Used in the SQuAD evaluation script.
- end_n_top (:obj:`int`, `optional`, defaults to 5):
+ end_n_top (`int`, *optional*, defaults to 5):
Used in the SQuAD evaluation script.
- mask_token_id (:obj:`int`, `optional`, defaults to 0):
+ mask_token_id (`int`, *optional*, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
- lang_id (:obj:`int`, `optional`, defaults to 1):
+ lang_id (`int`, *optional*, defaults to 1):
The ID of the language used by the model. This parameter is used when generating text in a given language.
"""
diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py
index ee6c824612..ecdaaf874a 100644
--- a/src/transformers/models/flaubert/tokenization_flaubert.py
+++ b/src/transformers/models/flaubert/tokenization_flaubert.py
@@ -82,11 +82,11 @@ class FlaubertTokenizer(XLMTokenizer):
- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- - The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
+ - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols
(like "__classify__") to a vocabulary.
- - The argument :obj:`do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
+ - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
- This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
+ This tokenizer inherits from [`XLMTokenizer`]. Please check the superclass for usage examples
and documentation regarding arguments.
"""
diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py
index a6922f8355..783064b5d4 100644
--- a/src/transformers/models/fnet/configuration_fnet.py
+++ b/src/transformers/models/fnet/configuration_fnet.py
@@ -29,63 +29,62 @@ FNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FNetConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.FNetModel`. It is used to
+ This is the configuration class to store the configuration of a [`FNetModel`]. It is used to
instantiate an FNet model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the FNet `fnet-base
- `__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the FNet [fnet-base](https://huggingface.co/google/fnet-base) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 32000):
+ vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the FNet model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.FNetModel` or
- :class:`~transformers.TFFNetModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `inputs_ids` passed when calling [`FNetModel`] or
+ [`TFFNetModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 4):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.FNetModel` or
- :class:`~transformers.TFFNetModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 4):
+ The vocabulary size of the `token_type_ids` passed when calling [`FNetModel`] or
+ [`TFFNetModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- use_tpu_fourier_optimizations (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Determines whether to use TPU optimized FFTs. If :obj:`True`, the model will favor axis-wise FFTs
- transforms. Set to :obj:`False` for GPU/CPU hardware, in which case n-dimensional FFTs are used.
- tpu_short_seq_length (:obj:`int`, `optional`, defaults to 512):
+ use_tpu_fourier_optimizations (`bool`, *optional*, defaults to `False`):
+ Determines whether to use TPU optimized FFTs. If `True`, the model will favor axis-wise FFTs
+ transforms. Set to `False` for GPU/CPU hardware, in which case n-dimensional FFTs are used.
+ tpu_short_seq_length (`int`, *optional*, defaults to 512):
The sequence length that is expected by the model when using TPUs. This will be used to initialize the DFT
- matrix only when `use_tpu_fourier_optimizations` is set to :obj:`True` and the input sequence is shorter
+ matrix only when *use_tpu_fourier_optimizations* is set to `True` and the input sequence is shorter
than or equal to 4096 tokens.
- Example::
+ Example:
+ ```python
+ >>> from transformers import FNetModel, FNetConfig
- >>> from transformers import FNetModel, FNetConfig
+ >>> # Initializing a FNet fnet-base style configuration
+ >>> configuration = FNetConfig()
- >>> # Initializing a FNet fnet-base style configuration
- >>> configuration = FNetConfig()
+ >>> # Initializing a model from the fnet-base style configuration
+ >>> model = FNetModel(configuration)
- >>> # Initializing a model from the fnet-base style configuration
- >>> model = FNetModel(configuration)
-
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "fnet"
def __init__(
diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py
index 7d9f248d86..ad7b1d6396 100644
--- a/src/transformers/models/fnet/tokenization_fnet.py
+++ b/src/transformers/models/fnet/tokenization_fnet.py
@@ -45,53 +45,51 @@ SPIECE_UNDERLINE = "▁"
class FNetTokenizer(PreTrainedTokenizer):
"""
- Construct an FNet tokenizer. Adapted from :class:`~transformers.AlbertTokenizer`. Based on `SentencePiece
- `__. This tokenizer inherits from
- :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. Users should refer to this
+ Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from
+ [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to this
superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
- remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ remove_space (`bool`, *optional*, defaults to `True`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
- keep_accents (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ keep_accents (`bool`, *optional*, defaults to `True`):
Whether or not to keep accents when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `""`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `""`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -221,17 +219,17 @@ class FNetTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An FNet sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -244,18 +242,18 @@ class FNetTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -272,20 +270,22 @@ class FNetTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet sequence
- pair mask has the following format: ::
+ pair mask has the following format:
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
+ ```
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py
index 099fd9c0c5..d6a43f21e8 100644
--- a/src/transformers/models/fnet/tokenization_fnet_fast.py
+++ b/src/transformers/models/fnet/tokenization_fnet_fast.py
@@ -54,35 +54,34 @@ SPIECE_UNDERLINE = "▁"
class FNetTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" FNetTokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
- :class:`~transformers.AlbertTokenizerFast`. Based on `Unigram
- `__. This tokenizer
- inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+ Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+ [`AlbertTokenizerFast`]. Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+ inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods
Args:
- vocab_file (:obj:`str`):
- `SentencePiece `__ file (generally has a `.spm` extension) that
+ vocab_file (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
- remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ remove_space (`bool`, *optional*, defaults to `True`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
- keep_accents (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ keep_accents (`bool`, *optional*, defaults to `True`):
Whether or not to keep accents when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `""`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `""`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -142,17 +141,17 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An FNet sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -167,21 +166,21 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet
sequence pair mask has the following format:
- ::
-
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py
index 8b60883e69..6e204a8c31 100644
--- a/src/transformers/models/fsmt/configuration_fsmt.py
+++ b/src/transformers/models/fsmt/configuration_fsmt.py
@@ -40,89 +40,89 @@ class DecoderConfig(PretrainedConfig):
class FSMTConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.FSMTModel`. It is used to
+ This is the configuration class to store the configuration of a [`FSMTModel`]. It is used to
instantiate a FSMT model according to the specified arguments, defining the model architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- langs (:obj:`List[str]`):
+ langs (`List[str]`):
A list with source language and target_language (e.g., ['en', 'ru']).
- src_vocab_size (:obj:`int`):
+ src_vocab_size (`int`):
Vocabulary size of the encoder. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed to the forward method in the encoder.
- tgt_vocab_size (:obj:`int`):
+ `inputs_ids` passed to the forward method in the encoder.
+ tgt_vocab_size (`int`):
Vocabulary size of the decoder. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed to the forward method in the decoder.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `inputs_ids` passed to the forward method in the decoder.
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
+ activation_function (`str` or `Callable`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ scale_embedding (`bool`, *optional*, defaults to `True`):
Scale embeddings by diving by sqrt(d_model).
- bos_token_id (:obj:`int`, `optional`, defaults to 0)
+ bos_token_id (`int`, *optional*, defaults to 0):
Beginning of stream token id.
- pad_token_id (:obj:`int`, `optional`, defaults to 1)
+ pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
- eos_token_id (:obj:`int`, `optional`, defaults to 2)
+ eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
- decoder_start_token_id (:obj:`int`, `optional`):
- This model starts decoding with :obj:`eos_token_id`
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+ decoder_start_token_id (`int`, *optional*):
+ This model starts decoding with `eos_token_id`
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
Google "layerdrop arxiv", as its not explainable in one line.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
Google "layerdrop arxiv", as its not explainable in one line.
- is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether this is an encoder/decoder model.
- tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie input and output embeddings.
- num_beams (:obj:`int`, `optional`, defaults to 5)
- Number of beams for beam search that will be used by default in the :obj:`generate` method of the model. 1
+ num_beams (`int`, *optional*, defaults to 5):
+ Number of beams for beam search that will be used by default in the `generate` method of the model. 1
means no beam search.
- length_penalty (:obj:`float`, `optional`, defaults to 1)
- Exponential penalty to the length that will be used by default in the :obj:`generate` method of the model.
- early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`)
- Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop the beam
- search when at least ``num_beams`` sentences are finished per batch or not.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ length_penalty (`float`, *optional*, defaults to 1):
+ Exponential penalty to the length that will be used by default in the `generate` method of the model.
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Flag that will be used by default in the `generate` method of the model. Whether to stop the beam
+ search when at least `num_beams` sentences are finished per batch or not.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
- The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
- :obj:`eos_token_id`.
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
- Examples::
+ Examples:
- >>> from transformers import FSMTConfig, FSMTModel
+ ```python
+ >>> from transformers import FSMTConfig, FSMTModel
- >>> config = FSMTConfig.from_pretrained('facebook/wmt19-en-ru')
- >>> model = FSMTModel(config)
-
- """
+ >>> config = FSMTConfig.from_pretrained('facebook/wmt19-en-ru')
+ >>> model = FSMTModel(config)
+ ```"""
model_type = "fsmt"
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
@@ -208,10 +208,10 @@ class FSMTConfig(PretrainedConfig):
def to_dict(self):
"""
- Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+ Serializes this instance to a Python dictionary. Override the default `to_dict()` from [`PretrainedConfig`].
Returns:
- :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["decoder"] = self.decoder.to_dict()
diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py
index ff99d75eeb..73a1ca8322 100644
--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -140,39 +140,42 @@ class FSMTTokenizer(PreTrainedTokenizer):
- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- - The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
+ - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols
(like "__classify__") to a vocabulary.
- - The argument :obj:`langs` defines a pair of languages.
+ - The argument `langs` defines a pair of languages.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- langs (:obj:`List[str]`):
- A list of two languages to translate from and to, for instance :obj:`["en", "ru"]`.
- src_vocab_file (:obj:`str`):
+ langs (`List[str]`):
+ A list of two languages to translate from and to, for instance `["en", "ru"]`.
+ src_vocab_file (`str`):
File containing the vocabulary for the source language.
- tgt_vocab_file (:obj:`st`):
+ tgt_vocab_file (`str`):
File containing the vocabulary for the target language.
- merges_file (:obj:`str`):
+ merges_file (`str`):
File containing the merges.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
@@ -398,17 +401,17 @@ class FSMTTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A FAIRSEQ Transformer sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
@@ -422,18 +425,18 @@ class FSMTTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -452,21 +455,21 @@ class FSMTTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
Transformer sequence pair mask has the following format:
- ::
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An
diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py
index 1d25e765c4..5c2d1c962e 100644
--- a/src/transformers/models/funnel/configuration_funnel.py
+++ b/src/transformers/models/funnel/configuration_funnel.py
@@ -36,69 +36,68 @@ FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FunnelConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.FunnelModel` or a
- :class:`~transformers.TFBertModel`. It is used to instantiate a Funnel Transformer model according to the specified
+ This is the configuration class to store the configuration of a [`FunnelModel`] or a
+ [`TFFunnelModel`]. It is used to instantiate a Funnel Transformer model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the Funnel Transformer `funnel-transformer/small
- <https://huggingface.co/funnel-transformer/small>`__ architecture.
+ configuration to that of the Funnel Transformer [funnel-transformer/small](https://huggingface.co/funnel-transformer/small) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the Funnel transformer. Defines the number of different tokens that can be represented
- by the :obj:`inputs_ids` passed when calling :class:`~transformers.FunnelModel` or
- :class:`~transformers.TFFunnelModel`.
- block_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[4, 4, 4]`):
+ by the `inputs_ids` passed when calling [`FunnelModel`] or
+ [`TFFunnelModel`].
+ block_sizes (`List[int]`, *optional*, defaults to `[4, 4, 4]`):
The sizes of the blocks used in the model.
- block_repeats (:obj:`List[int]`, `optional`):
+ block_repeats (`List[int]`, *optional*):
If passed along, each layer of each block is repeated the number of times indicated.
- num_decoder_layers (:obj:`int`, `optional`, defaults to 2):
+ num_decoder_layers (`int`, *optional*, defaults to 2):
The number of layers in the decoder (when not using the base model).
- d_model (:obj:`int`, `optional`, defaults to 768):
+ d_model (`int`, *optional*, defaults to 768):
Dimensionality of the model's hidden states.
- n_head (:obj:`int`, `optional`, defaults to 12):
+ n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- d_head (:obj:`int`, `optional`, defaults to 64):
+ d_head (`int`, *optional*, defaults to 64):
Dimensionality of the model's heads.
- d_inner (:obj:`int`, `optional`, defaults to 3072):
+ d_inner (`int`, *optional*, defaults to 3072):
Inner dimension in the feed-forward blocks.
- hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
+ hidden_act (`str` or `callable`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability used between the two layers of the feed-forward blocks.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 3):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.FunnelModel` or
- :class:`~transformers.TFFunnelModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.1):
- The standard deviation of the `uniform initializer` for initializing all weight matrices in attention
+ type_vocab_size (`int`, *optional*, defaults to 3):
+ The vocabulary size of the `token_type_ids` passed when calling [`FunnelModel`] or
+ [`TFFunnelModel`].
+ initializer_range (`float`, *optional*, defaults to 0.1):
+ The standard deviation of the *uniform initializer* for initializing all weight matrices in attention
layers.
- initializer_std (:obj:`float`, `optional`):
- The standard deviation of the `normal initializer` for initializing the embedding matrix and the weight of
+ initializer_std (`float`, *optional*):
+ The standard deviation of the *normal initializer* for initializing the embedding matrix and the weight of
linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for
linear layers.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-9):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-9):
The epsilon used by the layer normalization layers.
- pooling_type (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
- Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each block.
- attention_type (:obj:`str`, `optional`, defaults to :obj:`"relative_shift"`):
- Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while the
+ pooling_type (`str`, *optional*, defaults to `"mean"`):
+ Possible values are `"mean"` or `"max"`. The way pooling is performed at the beginning of each block.
+ attention_type (`str`, *optional*, defaults to `"relative_shift"`):
+ Possible values are `"relative_shift"` or `"factorized"`. The former is faster on CPU/GPU while the
latter is faster on TPU.
- separate_cls (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ separate_cls (`bool`, *optional*, defaults to `True`):
Whether or not to separate the cls token when applying pooling.
- truncate_seq (:obj:`bool`, `optional`, defaults to :obj:`False`):
- When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting a
+ truncate_seq (`bool`, *optional*, defaults to `False`):
+ When using `separate_cls`, whether or not to truncate the last token when pooling, to avoid getting a
sequence length that is not a multiple of 2.
- pool_q_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ pool_q_only (`bool`, *optional*, defaults to `False`):
Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
"""
model_type = "funnel"
diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py
index 04a4208fae..ca77e070b3 100644
--- a/src/transformers/models/funnel/modeling_tf_funnel.py
+++ b/src/transformers/models/funnel/modeling_tf_funnel.py
@@ -1267,17 +1267,18 @@ class TFFunnelForPreTraining(TFFunnelPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import FunnelTokenizer, TFFunnelForPreTraining
- >>> import torch
+ ```python
+ >>> from transformers import FunnelTokenizer, TFFunnelForPreTraining
+ >>> import torch
- >>> tokenizer = TFFunnelTokenizer.from_pretrained('funnel-transformer/small')
- >>> model = TFFunnelForPreTraining.from_pretrained('funnel-transformer/small')
+ >>> tokenizer = FunnelTokenizer.from_pretrained('funnel-transformer/small')
+ >>> model = TFFunnelForPreTraining.from_pretrained('funnel-transformer/small')
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "tf")
- >>> logits = model(inputs).logits
- """
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "tf")
+ >>> logits = model(inputs).logits
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py
index 8a2f00d847..991c048864 100644
--- a/src/transformers/models/funnel/tokenization_funnel.py
+++ b/src/transformers/models/funnel/tokenization_funnel.py
@@ -59,10 +59,10 @@ class FunnelTokenizer(BertTokenizer):
r"""
Construct a Funnel Transformer tokenizer.
- :class:`~transformers.FunnelTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+ [`FunnelTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
@@ -113,21 +113,21 @@ class FunnelTokenizer(BertTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
Transformer sequence pair mask has the following format:
- ::
+ ```
+ 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py
index 4ccab80d45..a185f005ed 100644
--- a/src/transformers/models/funnel/tokenization_funnel_fast.py
+++ b/src/transformers/models/funnel/tokenization_funnel_fast.py
@@ -70,12 +70,12 @@ PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case":
class FunnelTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`FunnelTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
@@ -129,21 +129,21 @@ class FunnelTokenizerFast(BertTokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
Transformer sequence pair mask has the following format:
- ::
+ ```
+ 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
- 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py
index be4f8df0a8..9ea843a523 100644
--- a/src/transformers/models/gpt2/configuration_gpt2.py
+++ b/src/transformers/models/gpt2/configuration_gpt2.py
@@ -37,97 +37,98 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a
- :class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments,
+ This is the configuration class to store the configuration of a [`GPT2Model`] or a
+ [`TFGPT2Model`]. It is used to instantiate a GPT-2 model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
- to that of the GPT-2 `small `__ architecture.
+ to that of the GPT-2 [small](https://huggingface.co/gpt2) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50257):
+ vocab_size (`int`, *optional*, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or
- :class:`~transformers.TFGPT2Model`.
- n_positions (:obj:`int`, `optional`, defaults to 1024):
+ `input_ids` passed when calling [`GPT2Model`] or
+ [`TFGPT2Model`].
+ n_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- n_embd (:obj:`int`, `optional`, defaults to 768):
+ n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
- n_layer (:obj:`int`, `optional`, defaults to 12):
+ n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- n_head (:obj:`int`, `optional`, defaults to 12):
+ n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- n_inner (:obj:`int`, `optional`, defaults to None):
- Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
- activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
- Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
- resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ n_inner (`int`, *optional*, defaults to None):
+ Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+ activation_function (`str`, *optional*, defaults to `"gelu"`):
+ Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
- attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`):
- Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
- and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+ summary_type (`str`, *optional*, defaults to `"cls_index"`):
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+ and [`TFGPT2DoubleHeadsModel`].
Has to be one of the following options:
- - :obj:`"last"`: Take the last token hidden state (like XLNet).
- - :obj:`"first"`: Take the first token hidden state (like BERT).
- - :obj:`"mean"`: Take the mean of all tokens hidden states.
- - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- - :obj:`"attn"`: Not implemented now, use multi-head attention.
- summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
- and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+ - `"last"`: Take the last token hidden state (like XLNet).
+ - `"first"`: Take the first token hidden state (like BERT).
+ - `"mean"`: Take the mean of all tokens hidden states.
+ - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+ - `"attn"`: Not implemented now, use multi-head attention.
+ summary_use_proj (`bool`, *optional*, defaults to `True`):
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+ and [`TFGPT2DoubleHeadsModel`].
Whether or not to add a projection after the vector extraction.
- summary_activation (:obj:`str`, `optional`):
+ summary_activation (`str`, *optional*):
Argument used when doing sequence summary. Used in for the multiple choice head in
- :class:`~transformers.GPT2DoubleHeadsModel`.
+ [`GPT2DoubleHeadsModel`].
- Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
- summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
- and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+ Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+ summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+ and [`TFGPT2DoubleHeadsModel`].
- Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
- summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
- Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
- and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+ Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+ summary_first_dropout (`float`, *optional*, defaults to 0.1):
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+ and [`TFGPT2DoubleHeadsModel`].
The dropout ratio to be used after the projection and activation.
- scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size)..
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- scale_attn_by_inverse_layer_idx (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to additionally scale attention weights by ``1 / layer_idx + 1``.
- reorder_and_upcast_attn (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+ Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
+ reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
dot-product/softmax to float() when training with mixed precision.
- Example::
+ Example:
- >>> from transformers import GPT2Model, GPT2Config
+ ```python
+ >>> from transformers import GPT2Model, GPT2Config
- >>> # Initializing a GPT2 configuration
- >>> configuration = GPT2Config()
+ >>> # Initializing a GPT2 configuration
+ >>> configuration = GPT2Config()
- >>> # Initializing a model from the configuration
- >>> model = GPT2Model(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = GPT2Model(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "gpt2"
keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index bdc019fcba..46f8f9a29d 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -642,17 +642,19 @@ PARALLELIZE_DOCSTRING = r"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to cpu from a model parallel state.
- Example::
+ Example:
- # On a 4 GPU machine with gpt2-large:
- model = GPT2LMHeadModel.from_pretrained('gpt2-large')
- device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
+ ```python
+ # On a 4 GPU machine with gpt2-large:
+ model = GPT2LMHeadModel.from_pretrained('gpt2-large')
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
- 1: [8, 9, 10, 11, 12, 13, 14, 15],
- 2: [16, 17, 18, 19, 20, 21, 22, 23],
- 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
- model.parallelize(device_map) # Splits the model across several devices
- model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+ 1: [8, 9, 10, 11, 12, 13, 14, 15],
+ 2: [16, 17, 18, 19, 20, 21, 22, 23],
+ 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
+ model.parallelize(device_map) # Splits the model across several devices
+ model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+ ```
"""
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index d09e4eedd0..e334c6c7c9 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -108,42 +108,43 @@ class GPT2Tokenizer(PreTrainedTokenizer):
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
- ::
+ ```
+ >>> from transformers import GPT2Tokenizer
+ >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ >>> tokenizer("Hello world")['input_ids']
+ [15496, 995]
+ >>> tokenizer(" Hello world")['input_ids']
+ [18435, 995]
+ ```
- >>> from transformers import GPT2Tokenizer
- >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
- >>> tokenizer("Hello world")['input_ids']
- [15496, 995]
- >>> tokenizer(" Hello world")['input_ids']
- [18435, 995]
-
- You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
- .. note::
+ <Tip>
- When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
- one).
+ When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
+ one).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ </Tip>
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
- Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
- `__ for more information.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
- add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (GPT2 tokenizer detect beginning of words by the preceding space).
"""
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
index 1751247f35..54ea4fa27e 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -69,51 +69,52 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
+ Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
- ::
+ ```
+ >>> from transformers import GPT2TokenizerFast
+ >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+ >>> tokenizer("Hello world")['input_ids']
+ [15496, 995]
+ >>> tokenizer(" Hello world")['input_ids']
+ [18435, 995]
+ ```
- >>> from transformers import GPT2TokenizerFast
- >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
- >>> tokenizer("Hello world")['input_ids']
- [15496, 995]
- >>> tokenizer(" Hello world")['input_ids']
- [18435, 995]
-
- You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer, but since
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
the model was not pretrained this way, it might yield a decrease in performance.
- .. note::
+ <Tip>
- When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
- ``add_prefix_space=True``.
+ When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
+ `add_prefix_space=True`.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ </Tip>
+
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
- Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
- `__ for more information.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
- add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (GPT2 tokenizer detect beginning of words by the preceding space).
- trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index 959d0bc7de..5499334c87 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -33,66 +33,65 @@ GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPTNeoConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.GPTNeoModel`. It is used to
+ This is the configuration class to store the configuration of a [`GPTNeoModel`]. It is used to
instantiate a GPT Neo model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the GPTNeo `gpt-neo-1.3B
- `__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the GPTNeo [gpt-neo-1.3B](https://huggingface.co/EleutherAI/gpt-neo-1.3B) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50257):
+ vocab_size (`int`, *optional*, defaults to 50257):
Vocabulary size of the GPT Neo model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.GPTNeoModel`. Vocabulary size of the model.
- Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
- :class:`~transformers.GPTNeoModel`.
- attention_types (:obj:`List`, `optional`, defaults to :obj:`[[["global", "local"], 12]]`):
- The type of attention for each layer in a :obj:`List` of the following format :obj:`[[["attention_type"],
- num_layerss]]` e.g. for a 24 layer model :obj:`[[["global"], 24]]` or :obj:`[[["global", "local"], 12]]`
- Choose the value of ``attention_type`` from :obj:`["global", "local"]`
- hidden_size (:obj:`int`, `optional`, defaults to 2048):
+ `input_ids` passed when calling [`GPTNeoModel`]. Vocabulary size of the model.
+ Defines the different tokens that can be represented by the *input_ids* passed to the forward method of
+ [`GPTNeoModel`].
+ attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`):
+ The type of attention for each layer in a `List` of the following format `[[["attention_type"], num_layers]]` e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]`.
+ Choose the value of `attention_type` from `["global", "local"]`
+ hidden_size (`int`, *optional*, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
- num_layers (:obj:`int`, `optional`, defaults to 24):
+ num_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
- num_heads (:obj:`int`, `optional`, defaults to 16):
+ num_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+ intermediate_size (`int`, *optional*, defaults to 8192):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ embed_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.GPTNeoModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`GPTNeoModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if ``config.is_decoder=True``.
+ relevant if `config.is_decoder=True`.
- Example::
+ Example:
- >>> from transformers import GPTNeoModel, GPTNeoConfig
+ ```python
+ >>> from transformers import GPTNeoModel, GPTNeoConfig
- >>> # Initializing a GPTNeo EleutherAI/gpt-neo-1.3B style configuration
- >>> configuration = GPTNeoConfig()
+ >>> # Initializing a GPTNeo EleutherAI/gpt-neo-1.3B style configuration
+ >>> configuration = GPTNeoConfig()
- >>> # Initializing a model from the EleutherAI/gpt-neo-1.3B style configuration
- >>> model = GPTNeoModel(configuration)
+ >>> # Initializing a model from the EleutherAI/gpt-neo-1.3B style configuration
+ >>> model = GPTNeoModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "gpt_neo"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py
index 6c754ddc42..1079169ac3 100644
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -28,60 +28,60 @@ GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPTJConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.GPTJModel`. It is used to
+ This is the configuration class to store the configuration of a [`GPTJModel`]. It is used to
instantiate a GPT-J model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the GPT-J `gpt-j-6B
- `__ architecture. Configuration objects inherit from
- :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
- :class:`~transformers.PretrainedConfig` for more information.
+ configuration with the defaults will yield a similar configuration to that of the GPT-J [gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B) architecture. Configuration objects inherit from
+ [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+ [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50400):
+ vocab_size (`int`, *optional*, defaults to 50400):
Vocabulary size of the GPT-J model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.GPTJModel`.
- n_positions (:obj:`int`, `optional`, defaults to 2048):
+ `input_ids` passed when calling [`GPTJModel`].
+ n_positions (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- n_embd (:obj:`int`, `optional`, defaults to 4096):
+ n_embd (`int`, *optional*, defaults to 4096):
Dimensionality of the embeddings and hidden states.
- n_layer (:obj:`int`, `optional`, defaults to 28):
+ n_layer (`int`, *optional*, defaults to 28):
Number of hidden layers in the Transformer encoder.
- n_head (:obj:`int`, `optional`, defaults to 16):
+ n_head (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- rotary_dim (:obj:`int`, `optional`, defaults to 64):
+ rotary_dim (`int`, *optional*, defaults to 64):
Number of dimensions in the embedding that Rotary Position Embedding is applied to.
- n_inner (:obj:`int`, `optional`, defaults to None):
- Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
- activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu_new"`):
- Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
- resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ n_inner (`int`, *optional*, defaults to None):
+ Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+ activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+ Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
- attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size).
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- Example::
+ Example:
- >>> from transformers import GPTJModel, GPTJConfig
+ ```python
+ >>> from transformers import GPTJModel, GPTJConfig
- >>> # Initializing a GPT-J 6B configuration
- >>> configuration = GPTJConfig()
+ >>> # Initializing a GPT-J 6B configuration
+ >>> configuration = GPTJConfig()
- >>> # Initializing a model from the configuration
- >>> model = GPTJModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = GPTJModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "gptj"
attribute_map = {
"max_position_embeddings": "n_positions",
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 0c6b60f65f..05e7f16700 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -424,15 +424,18 @@ PARALLELIZE_DOCSTRING = r"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to CPU from a model parallel state.
- Example::
- # On a 4 GPU machine with gpt-j-6B:
- model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
- device_map = {0: [0, 1, 2, 3, 4, 5, 6],
- 1: [7, 8, 9, 10, 11, 12, 13],
- 2: [14, 15, 16, 17, 18, 19, 20],
- 3: [21, 22, 23, 24, 25, 26, 27]}
- model.parallelize(device_map) # Splits the model across several devices
- model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+ Example:
+
+ ```python
+ # On a 4 GPU machine with gpt-j-6B:
+ model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6],
+ 1: [7, 8, 9, 10, 11, 12, 13],
+ 2: [14, 15, 16, 17, 18, 19, 20],
+ 3: [21, 22, 23, 24, 25, 26, 27]}
+ model.parallelize(device_map) # Splits the model across several devices
+ model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+ ```
"""
diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py
index 7f954f43b9..c025c02e9b 100644
--- a/src/transformers/models/herbert/tokenization_herbert.py
+++ b/src/transformers/models/herbert/tokenization_herbert.py
@@ -49,7 +49,7 @@ class HerbertTokenizer(XLMTokenizer):
- Such pretokenized input is BPE subtokenized
- This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
+ This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should
refer to the superclass for more information regarding methods.
"""
diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py
index 2961d5c94c..7d08b18983 100644
--- a/src/transformers/models/herbert/tokenization_herbert_fast.py
+++ b/src/transformers/models/herbert/tokenization_herbert_fast.py
@@ -39,20 +39,20 @@ PRETRAINED_INIT_CONFIGURATION = {}
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library).
+ Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
a punctuation character will be treated separately.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
"""
@@ -94,17 +94,17 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An HerBERT, like BERT sequence has the following format:
- - single sequence: `` X ``
- - pair of sequences: `` A B ``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
cls = [self.cls_token_id]
@@ -119,18 +119,18 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -148,19 +148,19 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. HerBERT, like
BERT sequence pair mask has the following format:
- ::
-
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
+ ```
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
+ ```
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py
index 84ed7a70bc..b1528c03fb 100644
--- a/src/transformers/models/hubert/configuration_hubert.py
+++ b/src/transformers/models/hubert/configuration_hubert.py
@@ -28,129 +28,126 @@ HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class HubertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.HubertModel`. It is used to
+ This is the configuration class to store the configuration of a [`HubertModel`]. It is used to
instantiate an Hubert model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Hubert
- `facebook/hubert-base-ls960 `__ architecture.
+ [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 32):
+ vocab_size (`int`, *optional*, defaults to 32):
Vocabulary size of the Hubert model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.HubertModel`. Vocabulary size of the model.
- Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
- :class:`~transformers.HubertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `inputs_ids` passed when calling [`HubertModel`]. Vocabulary size of the model.
+ Defines the different tokens that can be represented by the *inputs_ids* passed to the forward method of
+ [`HubertModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout(:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout(`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout(:obj:`float`, `optional`, defaults to 0.1):
+ attention_dropout(`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- final_dropout (:obj:`float`, `optional`, defaults to 0.1):
- The dropout probabilitiy for the final projection layer of :class:`Wav2Vec2ForCTC`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ final_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
- The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
- normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+ feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+ The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+ normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
- feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
- feat_proj_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
Whether to apply LayerNorm to the output of the feature extractor.
- feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+ feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
- extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+ extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
- feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
- conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+ feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+ conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
- of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
- conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+ of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+ conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
- length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
- `conv_dim`.
- conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+ *conv_dim*.
+ conv_bias (`bool`, *optional*, defaults to `False`):
Whether the 1D convolutional layers have a bias.
- num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+ num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.
- num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+ num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
- do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether do apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
- True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
- False`` corresponds to applying layer norm after the attention layer.
- apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+ Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+ apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
- `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
- `__.
- mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
- masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
- the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
- mask_time_length (:obj:`int`, `optional`, defaults to 10):
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+ the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+ mask_time_length (`int`, *optional*, defaults to 10):
Length of vector span along the time axis.
- mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
- The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
- step, irrespectively of ``mask_feature_prob``. Only relevant if
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
+ The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+ step, irrespectively of `mask_time_prob`. Only relevant if
''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
- mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
- span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
- overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
- is True``.
- mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+ overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+ mask_feature_length (`int`, *optional*, defaults to 10):
Length of vector span along the feature axis.
- mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
- The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
- step, irrespectively of ``mask_feature_prob``. Only relevant if
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+ step, irrespectively of `mask_feature_prob`. Only relevant if
''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
- ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
- Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
- instance of :class:`~transformers.HubertForCTC`.
- ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+ ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+ Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+ instance of [`HubertForCTC`].
+ ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+ Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
- instance of :class:`~transformers.HubertForCTC`.
- use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ instance of [`HubertForCTC`].
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
- instance of :class:`~transformers.HubertForSequenceClassification`.
- classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+ instance of [`HubertForSequenceClassification`].
+ classifier_proj_size (`int`, *optional*, defaults to 256):
Dimensionality of the projection before token mean-pooling for classification.
- Example::
+ Example:
- >>> from transformers import HubertModel, HubertConfig
+ ```python
+ >>> from transformers import HubertModel, HubertConfig
- >>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
- >>> configuration = HubertConfig()
+ >>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
+ >>> configuration = HubertConfig()
- >>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
- >>> model = HubertModel(configuration)
+ >>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
+ >>> model = HubertModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "hubert"
def __init__(
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 416f6ce63d..d75ad2a056 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -977,26 +977,27 @@ class HubertModel(HubertPreTrainedModel):
Returns:
- Example::
+ Example:
- >>> from transformers import Wav2Vec2Processor, HubertModel
- >>> from datasets import load_dataset
- >>> import soundfile as sf
+ ```python
+ >>> from transformers import Wav2Vec2Processor, HubertModel
+ >>> from datasets import load_dataset
+ >>> import soundfile as sf
- >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
- >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
+ >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
+ >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
- >>> def map_to_array(batch):
- ... speech, _ = sf.read(batch["file"])
- ... batch["speech"] = speech
- ... return batch
+ >>> def map_to_array(batch):
+ ... speech, _ = sf.read(batch["file"])
+ ... batch["speech"] = speech
+ ... return batch
- >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- >>> ds = ds.map(map_to_array)
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ >>> ds = ds.map(map_to_array)
- >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
- >>> hidden_states = model(input_values).last_hidden_state
- """
+ >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
+ >>> hidden_states = model(input_values).last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index d25ee4f38c..ab10009ad9 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1405,26 +1405,27 @@ class TFHubertModel(TFHubertPreTrainedModel):
Returns:
- Example::
+ Example:
- >>> from transformers import Wav2Vec2Processor, TFHubertModel
- >>> from datasets import load_dataset
- >>> import soundfile as sf
+ ```python
+ >>> from transformers import Wav2Vec2Processor, TFHubertModel
+ >>> from datasets import load_dataset
+ >>> import soundfile as sf
- >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
- >>> model = TFHubertModel.from_pretrained("facebook/hubert-base-960h")
+ >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
+ >>> model = TFHubertModel.from_pretrained("facebook/hubert-base-960h")
- >>> def map_to_array(batch):
- ... speech, _ = sf.read(batch["file"])
- ... batch["speech"] = speech
- ... return batch
+ >>> def map_to_array(batch):
+ ... speech, _ = sf.read(batch["file"])
+ ... batch["speech"] = speech
+ ... return batch
- >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- >>> ds = ds.map(map_to_array)
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ >>> ds = ds.map(map_to_array)
- >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
- >>> hidden_states = model(input_values).last_hidden_state
- """
+ >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
+ >>> hidden_states = model(input_values).last_hidden_state
+ ```"""
inputs = input_values_processing(
func=self.call,
diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py
index 397b6fd1e6..b389a30bf0 100644
--- a/src/transformers/models/ibert/configuration_ibert.py
+++ b/src/transformers/models/ibert/configuration_ibert.py
@@ -31,55 +31,53 @@ IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class IBertConfig(PretrainedConfig):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.IBertModel`. It is used to
+ This is the configuration class to store the configuration of an [`IBertModel`]. It is used to
instantiate a I-BERT model according to the specified arguments,
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the I-BERT model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.IBertModel`
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `inputs_ids` passed when calling [`IBertModel`]
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.IBertModel`
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`IBertModel`]
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- `__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- `__.
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether to quantize the model or not.
- force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+ force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision.
- :obj:`"none"`, :obj:`"gelu"`, :obj:`"softmax"`, :obj:`"layernorm"` and :obj:`"nonlinear"` are supported. As
- deafult, it is set as :obj:`"none"`, which does not dequantize any layers. Please specify :obj:`"gelu"`,
- :obj:`"softmax"`, or :obj:`"layernorm"` to dequantize GELU, Softmax, or LayerNorm, respectively.
- :obj:`"nonlinear"` will dequantize all nonlinear layers, i.e., GELU, Softmax, and LayerNorm.
+ `"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As
+ default, it is set as `"none"`, which does not dequantize any layers. Please specify `"gelu"`,
+ `"softmax"`, or `"layernorm"` to dequantize GELU, Softmax, or LayerNorm, respectively.
+ `"nonlinear"` will dequantize all nonlinear layers, i.e., GELU, Softmax, and LayerNorm.
"""
model_type = "ibert"
diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py
index 386988c06d..83b173a873 100644
--- a/src/transformers/models/ibert/quant_modules.py
+++ b/src/transformers/models/ibert/quant_modules.py
@@ -30,15 +30,15 @@ logger = logging.get_logger(__name__)
class QuantEmbedding(nn.Module):
"""
- Quantized version of :obj:`torch.nn.Embedding`. Adds quantization-specific arguments on top of
- :obj:`torch.nn.Embedding`.
+ Quantized version of `torch.nn.Embedding`. Adds quantization-specific arguments on top of
+ `torch.nn.Embedding`.
Args:
- weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
+ weight_bit (`int`, *optional*, defaults to `8`):
Bitwidth for the quantized weight.
- momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
+ momentum (`float`, *optional*, defaults to `0.95`):
Momentum for updating the activation quantization range.
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
"""
@@ -117,15 +117,15 @@ class QuantAct(nn.Module):
Quantizes the given activation.
Args:
- activation_bit (:obj:`int`):
+ activation_bit (`int`):
Bitwidth for the quantized activation.
- act_range_momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
+ act_range_momentum (`float`, *optional*, defaults to `0.95`):
Momentum for updating the activation quantization range.
- per_channel (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ per_channel (`bool`, *optional*, defaults to `False`):
Whether to or not use channel-wise quantization.
- channel_len (:obj:`int`, `optional`):
- Specify the channel length when set the `per_channel` True.
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ channel_len (`int`, *optional*):
+ Specify the channel length when *per_channel* is set to True.
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
"""
@@ -221,16 +221,16 @@ class QuantAct(nn.Module):
class QuantLinear(nn.Module):
"""
- Quantized version of :obj:`torch.nn.Linear`. Adds quantization-specific arguments on top of :obj:`torch.nn.Linear`.
+ Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.
Args:
- weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
+ weight_bit (`int`, *optional*, defaults to `8`):
Bitwidth for the quantized weight.
- bias_bit (:obj:`int`, `optional`, defaults to :obj:`32`):
+ bias_bit (`int`, *optional*, defaults to `32`):
Bitwidth for the quantized bias.
- per_channel (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ per_channel (`bool`, *optional*, defaults to `False`):
Whether or not to use channel-wise quantization.
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
"""
@@ -301,12 +301,12 @@ class QuantLinear(nn.Module):
class IntGELU(nn.Module):
"""
- Quantized version of :obj:`torch.nn.GELU`. Adds quantization-specific arguments on top of :obj:`torch.nn.GELU`.
+ Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.
Args:
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
- force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+ force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize the layer if either "gelu" or "nonlinear" is given.
"""
@@ -358,15 +358,15 @@ class IntGELU(nn.Module):
class IntSoftmax(nn.Module):
"""
- Quantized version of :obj:`torch.nn.Softmax`. Adds quantization-specific arguments on top of
- :obj:`torch.nn.Softmax`.
+ Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of
+ `torch.nn.Softmax`.
Args:
- output_bit (:obj:`int`):
+ output_bit (`int`):
Bitwidth for the layer output activation.
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
- force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+ force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize the layer if either "softmax" or "nonlinear" is given.
"""
@@ -430,15 +430,15 @@ class IntSoftmax(nn.Module):
class IntLayerNorm(nn.Module):
"""
- Quantized version of :obj:`torch.nn.LayerNorm`. Adds quantization-specific arguments on top of
- :obj:`torch.nn.LayerNorm`.
+ Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of
+ `torch.nn.LayerNorm`.
Args:
- output_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
+ output_bit (`int`, *optional*, defaults to `8`):
Bitwidth for the layer output activation.
- quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ quant_mode (`bool`, *optional*, defaults to `False`):
Whether or not the layer is quantized.
- force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+ force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize the layer if either "layernorm" or "nonlinear" is given.
"""
@@ -535,17 +535,17 @@ def get_percentile_min_max(input, lower_percentile, upper_percentile, output_ten
Calculate the percentile max and min values in a given tensor
Args:
- input (:obj:`torch.Tensor`):
+ input (`torch.Tensor`):
The target tensor to calculate percentile max and min.
- lower_percentile (:obj:`float`):
+ lower_percentile (`float`):
If 0.1, means we return the value of the smallest 0.1% value in the tensor as percentile min.
- upper_percentile (:obj:`float`):
+ upper_percentile (`float`):
If 99.9, means we return the value of the largest 0.1% value in the tensor as percentile max.
- output_tensor (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_tensor (`bool`, *optional*, defaults to `False`):
If True, this function returns tensors, otherwise it returns values.
Returns:
- :obj:`Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of `input`
+ `Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of *input*
"""
input_length = input.shape[0]
@@ -571,17 +571,17 @@ def linear_quantize(input, scale, zero_point, inplace=False):
Quantize single-precision input tensor to integers with the given scaling factor and zeropoint.
Args:
- input (:obj:`torch.Tensor`):
+ input (`torch.Tensor`):
Single-precision input tensor to be quantized.
- scale (:obj:`torch.Tensor`):
+ scale (`torch.Tensor`):
Scaling factor for quantization.
- zero_pint (:obj:`torch.Tensor`):
+ zero_point (`torch.Tensor`):
Shift for quantization.
- inplace (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ inplace (`bool`, *optional*, defaults to `False`):
Whether to compute inplace or not.
Returns:
- :obj:`torch.Tensor`: Linearly quantized value of `input` according to `scale` and `zero_point`.
+ `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
"""
# reshape scale and zeropoint for convolutional weights and activation
if len(input.shape) == 4:
@@ -606,16 +606,16 @@ def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_ma
Compute the scaling factor with the given quantization range for symmetric quantization.
Args:
- saturation_min (:obj:`torch.Tensor`):
+ saturation_min (`torch.Tensor`):
Lower bound for quantization range.
- saturation_max (:obj:`torch.Tensor`):
+ saturation_max (`torch.Tensor`):
Upper bound for quantization range.
- per_channel (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ per_channel (`bool`, *optional*, defaults to `False`):
Whether to or not use channel-wise quantization.
Returns:
- :obj:`torch.Tensor`: Scaling factor that linearly quantizes the given range between `saturation_min` and
- `saturation_max`.
+ `torch.Tensor`: Scaling factor that linearly quantizes the given range between *saturation_min* and
+ *saturation_max*.
"""
# in this part, we do not need any gradient computation,
# in order to enforce this, we put torch.no_grad()
@@ -642,18 +642,18 @@ class SymmetricQuantFunction(Function):
def forward(ctx, x, k, percentile_mode, scale):
"""
Args:
- x (:obj:`torch.Tensor`):
+ x (`torch.Tensor`):
Floating point tensor to be quantized.
- k (:obj:`int`):
+ k (`int`):
Quantization bitwidth.
- percentile_mode (:obj:`bool`):
+ percentile_mode (`bool`):
Whether or not to use percentile calibration.
- scale (:obj:`torch.Tensor`):
- Pre-calculated scaling factor for `x`. Note that the current implementation of SymmetricQuantFunction
+ scale (`torch.Tensor`):
+ Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
requires pre-calculated scaling factor.
Returns:
- :obj:`torch.Tensor`: Symmetric-quantized value of `input`.
+ `torch.Tensor`: Symmetric-quantized value of *x*.
"""
zero_point = torch.tensor(0.0).to(scale.device)
@@ -712,7 +712,7 @@ def batch_frexp(inputs, max_bit=31):
Decompose the scaling factor into mantissa and twos exponent.
Args:
- scaling_factor (:obj:`torch.Tensor`):
+ inputs (`torch.Tensor`):
Target scaling factor to decompose.
Returns:
@@ -746,22 +746,22 @@ class FixedPointMul(Function):
Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.
Args:
- pre_act (:obj:`torch.Tensor`):
+ pre_act (`torch.Tensor`):
Input tensor.
- pre_act_scaling_factor (:obj:`torch.Tensor`):
- Scaling factor of the input tensor `pre_act`.
- bit_num (:obj:`int`):
+ pre_act_scaling_factor (`torch.Tensor`):
+ Scaling factor of the input tensor *pre_act*.
+ bit_num (`int`):
Quantization bitwidth.
- z_scaling_factor (:obj:`torch.Tensor`):
+ z_scaling_factor (`torch.Tensor`):
Scaling factor of the output tensor.
- identity (:obj:`torch.Tensor`, `optional`):
+ identity (`torch.Tensor`, *optional*):
Identity tensor, if exists.
- identity_scaling_factor (:obj:`torch.Tensor`, `optional`):
- Scaling factor of the identity tensor `identity`, if exists.
+ identity_scaling_factor (`torch.Tensor`, *optional*):
+ Scaling factor of the identity tensor *identity*, if exists.
Returns:
- :obj:`torch.Tensor`: Output tensor(`pre_act` if `identity` is not given, otherwise the addition of `pre_act`
- and `identity`), whose scale is rescaled to `z_scaling_factor`.
+ `torch.Tensor`: Output tensor (*pre_act* if *identity* is not given, otherwise the addition of *pre_act*
+ and *identity*), whose scale is rescaled to *z_scaling_factor*.
"""
@staticmethod
diff --git a/src/transformers/models/imagegpt/configuration_imagegpt.py b/src/transformers/models/imagegpt/configuration_imagegpt.py
index 5a8d0db144..5cfec7e4b3 100644
--- a/src/transformers/models/imagegpt/configuration_imagegpt.py
+++ b/src/transformers/models/imagegpt/configuration_imagegpt.py
@@ -29,67 +29,68 @@ IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ImageGPTConfig(PretrainedConfig):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.ImageGPTModel` or a
- :class:`~transformers.TFImageGPTModel`. It is used to instantiate a GPT-2 model according to the specified
+ This is the configuration class to store the configuration of an [`ImageGPTModel`] or a
+ [`TFImageGPTModel`]. It is used to instantiate a GPT-2 model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the ImageGPT `small `__ architecture.
+ configuration to that of the ImageGPT [small](https://huggingface.co/imagegpt) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 512):
+ vocab_size (`int`, *optional*, defaults to 512):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.ImageGPTModel` or
- :class:`~transformers.TFImageGPTModel`.
- n_positions (:obj:`int`, `optional`, defaults to 32*32):
+ `input_ids` passed when calling [`ImageGPTModel`] or
+ [`TFImageGPTModel`].
+ n_positions (`int`, *optional*, defaults to 32*32):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- n_embd (:obj:`int`, `optional`, defaults to 512):
+ n_embd (`int`, *optional*, defaults to 512):
Dimensionality of the embeddings and hidden states.
- n_layer (:obj:`int`, `optional`, defaults to 24):
+ n_layer (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
- n_head (:obj:`int`, `optional`, defaults to 8):
+ n_head (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
- n_inner (:obj:`int`, `optional`, defaults to None):
- Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
- activation_function (:obj:`str`, `optional`, defaults to :obj:`"quick_gelu"`):
+ n_inner (`int`, *optional*, defaults to None):
+ Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+ activation_function (`str`, *optional*, defaults to `"quick_gelu"`):
Activation function (can be one of the activation functions defined in src/transformers/activations.py).
Defaults to "quick_gelu".
- resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
- attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size)..
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- scale_attn_by_inverse_layer_idx (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to additionally scale attention weights by ``1 / layer_idx + 1``.
- reorder_and_upcast_attn (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+ Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
+ reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
dot-product/softmax to float() when training with mixed precision.
- Example::
+ Example:
- >>> from transformers import ImageGPTModel, ImageGPTConfig
+ ```python
+ >>> from transformers import ImageGPTModel, ImageGPTConfig
- >>> # Initializing a ImageGPT configuration
- >>> configuration = ImageGPTConfig()
+ >>> # Initializing a ImageGPT configuration
+ >>> configuration = ImageGPTConfig()
- >>> # Initializing a model from the configuration
- >>> model = ImageGPTModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = ImageGPTModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "imagegpt"
keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
index 85aec8a634..a6a069afff 100644
--- a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
+++ b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
@@ -49,23 +49,23 @@ class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix
resolution (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel
values" (color clusters).
- This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+ This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- clusters (:obj:`np.ndarray`):
- The color clusters to use, as a :obj:`np.ndarray` of shape :obj:`(n_clusters, 3)`.
- do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to resize the input to a certain :obj:`size`.
- size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 32):
+ clusters (`np.ndarray`):
+ The color clusters to use, as a `np.ndarray` of shape `(n_clusters, 3)`.
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the input to a certain `size`.
+ size (`int` or `Tuple(int)`, *optional*, defaults to 32):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
- integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
- is set to :obj:`True`.
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
- An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
- :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
- Only has an effect if :obj:`do_resize` is set to :obj:`True`.
- do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+ is set to `True`.
+ resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+ An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+ `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+ Only has an effect if `do_resize` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input to the range between -1 and +1.
"""
@@ -81,14 +81,14 @@ class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix
def normalize(self, image):
"""
- Normalizes :obj:`image` into the range -1 to +1.
+ Normalizes `image` into the range -1 to +1.
Args:
- image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize.
Returns:
- :obj:`np.ndarray`: The normalized image.
+ `np.ndarray`: The normalized image.
"""
image = self.to_numpy_array(image, rescale=False, channel_first=False)
@@ -105,27 +105,29 @@ class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix
"""
Main method to prepare for the model one or several image(s).
- .. warning::
+ <Tip warning={true}>
- NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
- PIL images.
+ NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+ PIL images.
+
+ </Tip>
Args:
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
- * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py
index 913a6bf792..57b3bb4637 100644
--- a/src/transformers/models/layoutlm/configuration_layoutlm.py
+++ b/src/transformers/models/layoutlm/configuration_layoutlm.py
@@ -34,61 +34,60 @@ LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LayoutLMConfig(BertConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.LayoutLMModel`. It is used to
+ This is the configuration class to store the configuration of a [`LayoutLMModel`]. It is used to
instantiate a LayoutLM model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the LayoutLM `layoutlm-base-uncased
- `__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the LayoutLM [layoutlm-base-uncased](https://huggingface.co/microsoft/layoutlm-base-uncased) architecture.
- Configuration objects inherit from :class:`~transformers.BertConfig` and can be used to control the model outputs.
- Read the documentation from :class:`~transformers.BertConfig` for more information.
+ Configuration objects inherit from [`BertConfig`] and can be used to control the model outputs.
+ Read the documentation from [`BertConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the LayoutLM model. Defines the different tokens that can be represented by the
- `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `input_ids` passed to the forward method of [`LayoutLMModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed into [`LayoutLMModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum value that the 2D position embedding might ever used. Typically set this to something large
just in case (e.g., 1024).
- Examples::
+ Examples:
- >>> from transformers import LayoutLMModel, LayoutLMConfig
+ ```python
+ >>> from transformers import LayoutLMModel, LayoutLMConfig
- >>> # Initializing a LayoutLM configuration
- >>> configuration = LayoutLMConfig()
+ >>> # Initializing a LayoutLM configuration
+ >>> configuration = LayoutLMConfig()
- >>> # Initializing a model from the configuration
- >>> model = LayoutLMModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = LayoutLMModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
-
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "layoutlm"
def __init__(
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index 3aae3c62dd..ac0f8b7359 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -747,34 +747,35 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, LayoutLMModel
- >>> import torch
+ ```python
+ >>> from transformers import LayoutLMTokenizer, LayoutLMModel
+ >>> import torch
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "world"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "world"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = torch.tensor([token_boxes])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = torch.tensor([token_boxes])
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
index 88326c109c..aa33734d4d 100644
--- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
@@ -947,34 +947,35 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMTokenizer, TFLayoutLMModel
- >>> import tensorflow as tf
+ ```python
+ >>> from transformers import LayoutLMTokenizer, TFLayoutLMModel
+ >>> import tensorflow as tf
- >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> model = TFLayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+ >>> model = TFLayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
- >>> words = ["Hello", "world"]
- >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+ >>> words = ["Hello", "world"]
+ >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
- >>> token_boxes = []
- >>> for word, box in zip(words, normalized_word_boxes):
- ... word_tokens = tokenizer.tokenize(word)
- ... token_boxes.extend([box] * len(word_tokens))
- >>> # add bounding boxes of cls + sep tokens
- >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+ >>> token_boxes = []
+ >>> for word, box in zip(words, normalized_word_boxes):
+ ... word_tokens = tokenizer.tokenize(word)
+ ... token_boxes.extend([box] * len(word_tokens))
+ >>> # add bounding boxes of cls + sep tokens
+ >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
- >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
- >>> input_ids = encoding["input_ids"]
- >>> attention_mask = encoding["attention_mask"]
- >>> token_type_ids = encoding["token_type_ids"]
- >>> bbox = tf.convert_to_tensor([token_boxes])
+ >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+ >>> input_ids = encoding["input_ids"]
+ >>> attention_mask = encoding["attention_mask"]
+ >>> token_type_ids = encoding["token_type_ids"]
+ >>> bbox = tf.convert_to_tensor([token_boxes])
- >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+ >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py
index 6a961c7747..603d730e8a 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -47,10 +47,10 @@ class LayoutLMTokenizer(BertTokenizer):
r"""
Constructs a LayoutLM tokenizer.
- :class:`~transformers.LayoutLMTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+ [`LayoutLMTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting + wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
index 533645693e..8e8e13a90f 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
@@ -52,10 +52,10 @@ class LayoutLMTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" LayoutLMTokenizer.
- :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`LayoutLMTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting + wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
index f9ad445bf0..de19988365 100644
--- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
@@ -34,87 +34,87 @@ if is_detectron2_available():
class LayoutLMv2Config(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.LayoutLMv2Model`. It is used
+ This is the configuration class to store the configuration of a [`LayoutLMv2Model`]. It is used
to instantiate an LayoutLMv2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the LayoutLMv2
- `microsoft/layoutlmv2-base-uncased `__ architecture.
+ [microsoft/layoutlmv2-base-uncased](https://huggingface.co/microsoft/layoutlmv2-base-uncased) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the LayoutLMv2 model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.LayoutLMv2Model` or
- :class:`~transformers.TFLayoutLMv2Model`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ the `input_ids` passed when calling [`LayoutLMv2Model`] or
+ [`TFLayoutLMv2Model`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.LayoutLMv2Model`
- or :class:`~transformers.TFLayoutLMv2Model`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`]
+ or [`TFLayoutLMv2Model`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum value that the 2D position embedding might ever be used with. Typically set this to something
large just in case (e.g., 1024).
- max_rel_pos (:obj:`int`, `optional`, defaults to 128):
+ max_rel_pos (`int`, *optional*, defaults to 128):
The maximum number of relative positions to be used in the self-attention mechanism.
- rel_pos_bins (:obj:`int`, `optional`, defaults to 32):
+ rel_pos_bins (`int`, *optional*, defaults to 32):
The number of relative position bins to be used in the self-attention mechanism.
- fast_qkv (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ fast_qkv (`bool`, *optional*, defaults to `True`):
Whether or not to use a single matrix for the queries, keys, values in the self-attention layers.
- max_rel_2d_pos (:obj:`int`, `optional`, defaults to 256):
+ max_rel_2d_pos (`int`, *optional*, defaults to 256):
The maximum number of relative 2D positions in the self-attention mechanism.
- rel_2d_pos_bins (:obj:`int`, `optional`, defaults to 64):
+ rel_2d_pos_bins (`int`, *optional*, defaults to 64):
The number of 2D relative position bins in the self-attention mechanism.
- image_feature_pool_shape (:obj:`List[int]`, `optional`, defaults to [7, 7, 256]):
+ image_feature_pool_shape (`List[int]`, *optional*, defaults to [7, 7, 256]):
The shape of the average-pooled feature map.
- coordinate_size (:obj:`int`, `optional`, defaults to 128):
+ coordinate_size (`int`, *optional*, defaults to 128):
Dimension of the coordinate embeddings.
- shape_size (:obj:`int`, `optional`, defaults to 128):
+ shape_size (`int`, *optional*, defaults to 128):
Dimension of the width and height embeddings.
- has_relative_attention_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ has_relative_attention_bias (`bool`, *optional*, defaults to `True`):
Whether or not to use a relative attention bias in the self-attention mechanism.
- has_spatial_attention_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ has_spatial_attention_bias (`bool`, *optional*, defaults to `True`):
Whether or not to use a spatial attention bias in the self-attention mechanism.
- has_visual_segment_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ has_visual_segment_embedding (`bool`, *optional*, defaults to `False`):
Whether or not to add visual segment embeddings.
- detectron2_config_args (:obj:`dict`, `optional`):
- Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to `this file
- `__
+ detectron2_config_args (`dict`, *optional*):
+ Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to [this file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py)
for details regarding default values.
- Example::
+ Example:
- >>> from transformers import LayoutLMv2Model, LayoutLMv2Config
+ ```python
+ >>> from transformers import LayoutLMv2Model, LayoutLMv2Config
- >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
- >>> configuration = LayoutLMv2Config()
+ >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
+ >>> configuration = LayoutLMv2Config()
- >>> # Initializing a model from the microsoft/layoutlmv2-base-uncased style configuration
- >>> model = LayoutLMv2Model(configuration)
+ >>> # Initializing a model from the microsoft/layoutlmv2-base-uncased style configuration
+ >>> model = LayoutLMv2Model(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "layoutlmv2"
def __init__(
diff --git a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
index 7a8c4fab7b..b10cedf4ed 100644
--- a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
@@ -85,31 +85,32 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
Constructs a LayoutLMv2 feature extractor. This can be used to resize document images to the same size, as well as
to apply OCR on them in order to get a list of words and normalized bounding boxes.
- This feature extractor inherits from :class:`~transformers.feature_extraction_utils.PreTrainedFeatureExtractor`
+ This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`]
which contains most of the main methods. Users should refer to this superclass for more information regarding those
methods.
Args:
- do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to resize the input to a certain :obj:`size`.
- size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 224):
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the input to a certain `size`.
+ size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
- integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
- is set to :obj:`True`.
- resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
- An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
- :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
- Only has an effect if :obj:`do_resize` is set to :obj:`True`.
- apply_ocr (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+ is set to `True`.
+ resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+ An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+ `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+ Only has an effect if `do_resize` is set to `True`.
+ apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
- ocr_lang (:obj:`Optional[str]`, `optional`):
+ ocr_lang (`Optional[str]`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
- .. note::
+
- LayoutLMv2FeatureExtractor uses Google's Tesseract OCR engine under the hood.
- """
+ LayoutLMv2FeatureExtractor uses Google's Tesseract OCR engine under the hood.
+
+ """
model_input_names = ["pixel_values"]
@@ -130,48 +131,49 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
Main method to prepare for the model one or several image(s).
Args:
- images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
- * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
- :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **words** -- Optional words as identified by Tesseract OCR (only when
- :class:`~transformers.LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr` set to ``True``).
+ [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`).
- **boxes** -- Optional bounding boxes as identified by Tesseract OCR, normalized based on the image size
- (only when :class:`~transformers.LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr` set to
- ``True``).
+ (only when [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+ `True`).
- Examples::
+ Examples:
- >>> from transformers import LayoutLMv2FeatureExtractor
- >>> from PIL import Image
+ ```python
+ >>> from transformers import LayoutLMv2FeatureExtractor
+ >>> from PIL import Image
- >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+ >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
- >>> # option 1: with apply_ocr=True (default)
- >>> feature_extractor = LayoutLMv2FeatureExtractor()
- >>> encoding = feature_extractor(image, return_tensors="pt")
- >>> print(encoding.keys())
- >>> # dict_keys(['pixel_values', 'words', 'boxes'])
+ >>> # option 1: with apply_ocr=True (default)
+ >>> feature_extractor = LayoutLMv2FeatureExtractor()
+ >>> encoding = feature_extractor(image, return_tensors="pt")
+ >>> print(encoding.keys())
+ >>> # dict_keys(['pixel_values', 'words', 'boxes'])
- >>> # option 2: with apply_ocr=False
- >>> feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
- >>> encoding = feature_extractor(image, return_tensors="pt")
- >>> print(encoding.keys())
- >>> # dict_keys(['pixel_values'])
- """
+ >>> # option 2: with apply_ocr=False
+ >>> feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+ >>> encoding = feature_extractor(image, return_tensors="pt")
+ >>> print(encoding.keys())
+ >>> # dict_keys(['pixel_values'])
+ ```"""
# Input type checking for clearer error
valid_images = False
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index e05bff9ebe..3df07a66f5 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -820,21 +820,22 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
r"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
- >>> from PIL import Image
+ ```python
+ >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
+ >>> from PIL import Image
- >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> model = LayoutLMv2Model.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
+ >>> model = LayoutLMv2Model.from_pretrained('microsoft/layoutlmv2-base-uncased')
- >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+ >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
- >>> encoding = processor(image, return_tensors="pt")
+ >>> encoding = processor(image, return_tensors="pt")
- >>> outputs = model(**encoding)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model(**encoding)
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
index ed91556bc3..d49dbc99bb 100644
--- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
@@ -29,22 +29,22 @@ class LayoutLMv2Processor:
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
single processor.
- :class:`~transformers.LayoutLMv2Processor` offers all the functionalities you need to prepare data for the model.
+ [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
- It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
+ It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and
optionally applies OCR to get words and normalized bounding boxes. These are then provided to
- :class:`~transformers.LayoutLMv2Tokenizer` or :class:`~transformers.LayoutLMv2TokenizerFast`, which turns the words
- and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
- Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
+ [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`], which turns the words
+ and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
+ Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
classification tasks (such as FUNSD, CORD).
Args:
- feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
- An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
+ feature_extractor (`LayoutLMv2FeatureExtractor`):
+ An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required
input.
- tokenizer (:obj:`LayoutLMv2Tokenizer` or :obj:`LayoutLMv2TokenizerFast`):
- An instance of :class:`~transformers.LayoutLMv2Tokenizer` or
- :class:`~transformers.LayoutLMv2TokenizerFast`. The tokenizer is a required input.
+ tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
+ An instance of [`LayoutLMv2Tokenizer`] or
+ [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
"""
def __init__(self, feature_extractor, tokenizer):
@@ -62,18 +62,20 @@ class LayoutLMv2Processor:
def save_pretrained(self, save_directory):
"""
- Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory ``save_directory``,
- so that it can be re-loaded using the :func:`~transformers.LayoutLMv2Processor.from_pretrained` class method.
+ Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory `save_directory`,
+ so that it can be re-loaded using the [`~LayoutLMv2Processor.from_pretrained`] class method.
- .. note::
+
- This class method is simply calling
- :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
- :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
- docstrings of the methods above for more information.
+ This class method is simply calling
+ [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+ [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+ docstrings of the methods above for more information.
+
+
Args:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
@@ -84,35 +86,37 @@ class LayoutLMv2Processor:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
r"""
- Instantiate a :class:`~transformers.LayoutLMv2Processor` from a pretrained LayoutLMv2 processor.
+ Instantiate a [`LayoutLMv2Processor`] from a pretrained LayoutLMv2 processor.
- .. note::
+
- This class method is simply calling LayoutLMv2FeatureExtractor's
- :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
- LayoutLMv2TokenizerFast's
- :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
- docstrings of the methods above for more information.
+ This class method is simply calling LayoutLMv2FeatureExtractor's
+ [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+ LayoutLMv2TokenizerFast's
+ [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+ docstrings of the methods above for more information.
+
+
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a feature extractor file saved using the
- :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
- ``./my_model_directory/``.
- - a path or url to a saved feature extractor JSON `file`, e.g.,
- ``./my_model_directory/preprocessor_config.json``.
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a feature extractor file saved using the
+ [`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
+ `./my_model_directory/`.
+ - a path or url to a saved feature extractor JSON *file*, e.g.,
+ `./my_model_directory/preprocessor_config.json`.
- use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_fast (`bool`, *optional*, defaults to `True`):
Whether or not to instantiate a fast tokenizer.
**kwargs
- Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
- :class:`~transformers.PreTrainedTokenizer`
+ Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+ [`PreTrainedTokenizer`]
"""
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
@@ -146,14 +150,12 @@ class LayoutLMv2Processor:
**kwargs
) -> BatchEncoding:
"""
- This method first forwards the :obj:`images` argument to
- :meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was
- initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with
- the additional arguments to :meth:`~transformers.LayoutLMv2Tokenizer.__call__` and returns the output, together
- with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
- set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
- along with the additional arguments to :meth:`~transformers.LayoutLMv2Tokenizer.__call__` and returns the
- output, together with resized :obj:`images`.
+ This method first forwards the `images` argument to
+ [`~LayoutLMv2FeatureExtractor.__call__`]. In case [`LayoutLMv2FeatureExtractor`] was
+ initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with
+ the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together
+ with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr`
+ set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images`.
Please refer to the docstring of the above two methods for more information.
"""
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index 2c1f6eb712..87057a325d 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -59,51 +59,51 @@ PRETRAINED_INIT_CONFIGURATION = {
LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ add_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to encode the sequences with the special tokens relative to their model.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
Activates and controls truncation. Accepts the following values:
- * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
- :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+ `max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair
if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
sequence lengths greater than the model maximum admissible input size).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
- :obj:`None`, this will use the predefined model maximum length if a maximum length is required by one
+ `None`, this will use the predefined model maximum length if a maximum length is required by one
of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
truncation/padding to a maximum length will be deactivated.
- stride (:obj:`int`, `optional`, defaults to 0):
- If set to a number along with :obj:`max_length`, the overflowing tokens returned when
- :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+ stride (`int`, *optional*, defaults to 0):
+ If set to a number along with `max_length`, the overflowing tokens returned when
+ `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
returned to provide some overlap between truncated and overflowing sequences. The value of this
argument defines the number of overlapping tokens.
- pad_to_multiple_of (:obj:`int`, `optional`):
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
"""
@@ -145,14 +145,14 @@ def subfinder(mylist, pattern):
class LayoutLMv2Tokenizer(PreTrainedTokenizer):
r"""
- Construct a LayoutLMv2 tokenizer. Based on WordPiece. :class:`~transformers.LayoutLMv2Tokenizer` can be used to
- turn words, word-level bounding boxes and optional word labels to token-level :obj:`input_ids`,
- :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`, and optional :obj:`labels` (for token classification).
+ Construct a LayoutLMv2 tokenizer. Based on WordPiece. [`LayoutLMv2Tokenizer`] can be used to
+ turn words, word-level bounding boxes and optional word labels to token-level `input_ids`,
+ `attention_mask`, `token_type_ids`, `bbox`, and optional `labels` (for token classification).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
- :class:`~transformers.LayoutLMv2Tokenizer` runs end-to-end tokenization: punctuation splitting and wordpiece. It
+ [`LayoutLMv2Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It
also turns the word-level bounding boxes into token-level bounding boxes.
"""
@@ -274,17 +274,17 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -297,18 +297,18 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -326,16 +326,16 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
- sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
@@ -392,16 +392,16 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
sequences with word-level normalized bounding boxes and optional labels.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
(words of a single example or questions of a batch of examples) or a list of list of strings (batch of
words).
- text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+ text_pair (`List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
(pretokenized string).
- boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+ boxes (`List[List[int]]`, `List[List[List[int]]]`):
Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
- word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+ word_labels (`List[int]`, `List[List[int]]`, *optional*):
Word-level integer labels (for token classification tasks such as FUNSD, CORD).
"""
# Input type checking for clearer error
@@ -772,12 +772,12 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
- ``__call__`` should be used instead.
+ `__call__` should be used instead.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
- text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+ text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
@@ -893,18 +893,18 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
"""
Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
truncates sequences if overflowing while taking into account the special tokens and manages a moving window
- (with user defined stride) for overflowing tokens. Please Note, for `text_pair` different than `None` and
- `truncation_strategy = longest_first` or `True`, it is not possible to return overflowing tokens. Such a
+ (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than *None* and
+ *truncation_strategy = longest_first* or *True*, it is not possible to return overflowing tokens. Such a
combination of arguments will raise an error.
- Word-level :obj:`boxes` are turned into token-level :obj:`bbox`. If provided, word-level :obj:`word_labels` are
- turned into token-level :obj:`labels`. The word label is used for the first token of the word, while remaining
+ Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are
+ turned into token-level `labels`. The word label is used for the first token of the word, while remaining
tokens are labeled with -100, such that they will be ignored by the loss function.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
- text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+ text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
@@ -1092,42 +1092,42 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
Truncates a sequence pair in-place following the strategy.
Args:
- ids (:obj:`List[int]`):
- Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
- and ``convert_tokens_to_ids`` methods.
- token_boxes (:obj:`List[List[int]]`):
+ ids (`List[int]`):
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize`
+ and `convert_tokens_to_ids` methods.
+ token_boxes (`List[List[int]]`):
Bounding boxes of the first sequence.
- pair_ids (:obj:`List[int]`, `optional`):
- Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
- and ``convert_tokens_to_ids`` methods.
- pair_token_boxes (:obj:`List[List[int]]`, `optional`):
+ pair_ids (`List[int]`, *optional*):
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+ and `convert_tokens_to_ids` methods.
+ pair_token_boxes (`List[List[int]]`, *optional*):
Bounding boxes of the second sequence.
- labels (:obj:`List[int]`, `optional`):
+ labels (`List[int]`, *optional*):
Labels of the first sequence (for token classification tasks).
- num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
+ num_tokens_to_remove (`int`, *optional*, defaults to 0):
Number of tokens to remove using the truncation strategy.
- truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
The strategy to follow for truncation. Can be:
- * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will
truncate token by token, removing a token from the longest sequence in the pair if a pair of
sequences (or a batch of pairs) is provided.
- * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+ - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
- stride (:obj:`int`, `optional`, defaults to 0):
+ stride (`int`, *optional*, defaults to 0):
If set to a positive number, the overflowing tokens returned will contain some tokens from the main
sequence returned. The value of this argument defines the number of additional tokens.
Returns:
- :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
- list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing tokens if
+ `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the
+ list of overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if
a pair of sequences (or a batch of pairs) is provided.
"""
if num_tokens_to_remove <= 0:
@@ -1291,19 +1291,18 @@ class BasicTokenizer(object):
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- never_split (:obj:`Iterable`, `optional`):
+ never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
- :obj:`do_basic_tokenize=True`
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ `do_basic_tokenize=True`
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
- This should likely be deactivated for Japanese (see this `issue
- `__).
- strip_accents: (:obj:`bool`, `optional`):
+ This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
+ value for `lowercase` (as in the original BERT).
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -1320,9 +1319,9 @@ class BasicTokenizer(object):
WordPieceTokenizer.
Args:
- **never_split**: (`optional`) list of str
+ never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
- :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+ [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -1449,14 +1448,14 @@ class WordpieceTokenizer(object):
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
- For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+ For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
- text: A single token or whitespace separated tokens. This should have
- already been passed through `BasicTokenizer`.
+ text: A single token or whitespace separated tokens. This should have
+ already been passed through *BasicTokenizer*.
Returns:
- A list of wordpiece tokens.
+ A list of wordpiece tokens.
"""
output_tokens = []
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
index 73a2cc2cb3..cab5df57d8 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -61,48 +61,48 @@ PRETRAINED_INIT_CONFIGURATION = {
class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
r"""
- Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+ Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+ cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [CLS] token.
- sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
+ sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
The bounding box to use for the special [SEP] token.
- pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+ pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [PAD] token.
- pad_token_label (:obj:`int`, `optional`, defaults to -100):
- The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
+ pad_token_label (`int`, *optional*, defaults to -100):
+ The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
CrossEntropyLoss.
- only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ only_label_first_subword (`bool`, *optional*, defaults to `True`):
Whether or not to only label the first subword, in case word labels are provided.
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
- issue `__).
- strip_accents: (:obj:`bool`, `optional`):
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+ issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original LayoutLMv2).
+ value for `lowercase` (as in the original LayoutLMv2).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -196,16 +196,16 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
sequences with word-level normalized bounding boxes and optional labels.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
(words of a single example or questions of a batch of examples) or a list of list of strings (batch of
words).
- text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+ text_pair (`List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
(pretokenized string).
- boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+ boxes (`List[List[int]]`, `List[List[List[int]]]`):
Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
- word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+ word_labels (`List[int]`, `List[List[int]]`, *optional*):
Word-level integer labels (for token classification tasks such as FUNSD, CORD).
"""
# Input type checking for clearer error
@@ -407,12 +407,12 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
- ``__call__`` should be used instead.
+ `__call__` should be used instead.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
- text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+ text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
@@ -760,17 +760,17 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -785,16 +785,16 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
- sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+ sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py
index 7178797fbf..59c7cb0e6c 100644
--- a/src/transformers/models/layoutxlm/processing_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -30,21 +30,21 @@ class LayoutXLMProcessor:
Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
single processor.
- :class:`~transformers.LayoutXLMProcessor` offers all the functionalities you need to prepare data for the model.
+ [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.
- It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
+ It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and
optionally applies OCR to get words and normalized bounding boxes. These are then provided to
- :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
- and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
- Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
+ [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`], which turns the words
+ and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
+ Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
classification tasks (such as FUNSD, CORD).
Args:
- feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
- An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
+ feature_extractor (`LayoutLMv2FeatureExtractor`):
+ An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required
input.
- tokenizer (:obj:`LayoutXLMTokenizer` or :obj:`LayoutXLMTokenizerFast`):
- An instance of :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`.
+ tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
+ An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`].
The tokenizer is a required input.
"""
@@ -63,18 +63,20 @@ class LayoutXLMProcessor:
def save_pretrained(self, save_directory):
"""
- Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory ``save_directory``,
- so that it can be re-loaded using the :func:`~transformers.LayoutXLMProcessor.from_pretrained` class method.
+ Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory `save_directory`,
+ so that it can be re-loaded using the [`~LayoutXLMProcessor.from_pretrained`] class method.
- .. note::
+
- This class method is simply calling
- :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
- :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
- docstrings of the methods above for more information.
+ This class method is simply calling
+ [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+ [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+ docstrings of the methods above for more information.
+
+
Args:
- save_directory (:obj:`str` or :obj:`os.PathLike`):
+ save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
@@ -85,34 +87,36 @@ class LayoutXLMProcessor:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
r"""
- Instantiate a :class:`~transformers.LayoutXLMProcessor` from a pretrained LayoutXLM processor.
+ Instantiate a [`LayoutXLMProcessor`] from a pretrained LayoutXLM processor.
- .. note::
+
- This class method is simply calling Layoutv2FeatureExtractor's
- :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
- LayoutXLMTokenizerFast's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
- Please refer to the docstrings of the methods above for more information.
+ This class method is simply calling LayoutLMv2FeatureExtractor's
+ [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+ LayoutXLMTokenizerFast's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
+ Please refer to the docstrings of the methods above for more information.
+
+
Args:
- pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
- namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing a feature extractor file saved using the
- :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
- ``./my_model_directory/``.
- - a path or url to a saved feature extractor JSON `file`, e.g.,
- ``./my_model_directory/preprocessor_config.json``.
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a feature extractor file saved using the
+ [`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
+ `./my_model_directory/`.
+ - a path or url to a saved feature extractor JSON *file*, e.g.,
+ `./my_model_directory/preprocessor_config.json`.
- use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_fast (`bool`, *optional*, defaults to `True`):
Whether or not to instantiate a fast tokenizer.
**kwargs
- Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
- :class:`~transformers.PreTrainedTokenizer`
+ Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+ [`PreTrainedTokenizer`]
"""
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
@@ -146,14 +150,12 @@ class LayoutXLMProcessor:
**kwargs
) -> BatchEncoding:
"""
- This method first forwards the :obj:`images` argument to
- :meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was
- initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with
- the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the output, together
- with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
- set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
- along with the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the
- output, together with resized :obj:`images`.
+ This method first forwards the `images` argument to
+ [`~LayoutLMv2FeatureExtractor.__call__`]. In case [`LayoutLMv2FeatureExtractor`] was
+ initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with
+ the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together
+ with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr`
+ set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images`.
Please refer to the docstring of the above two methods for more information.
"""
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 0e40cb06fe..75cc9a591c 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -47,75 +47,80 @@ logger = logging.get_logger(__name__)
class LayoutXLMTokenizer(PreTrainedTokenizer):
"""
- Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on
- `SentencePiece `__.
+ Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+ [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+ cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [CLS] token.
- sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
+ sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
The bounding box to use for the special [SEP] token.
- pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+ pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [PAD] token.
- pad_token_label (:obj:`int`, `optional`, defaults to -100):
- The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
+ pad_token_label (`int`, *optional*, defaults to -100):
+ The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
CrossEntropyLoss.
- only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ only_label_first_subword (`bool`, *optional*, defaults to `True`):
Whether or not to only label the first subword, in case word labels are provided.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -212,17 +217,17 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -236,18 +241,18 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -267,13 +272,13 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
@@ -357,16 +362,16 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
sequences with word-level normalized bounding boxes and optional labels.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
(words of a single example or questions of a batch of examples) or a list of list of strings (batch of
words).
- text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+ text_pair (`List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
(pretokenized string).
- boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+ boxes (`List[List[int]]`, `List[List[List[int]]]`):
Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
- word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+ word_labels (`List[int]`, `List[List[int]]`, *optional*):
Word-level integer labels (for token classification tasks such as FUNSD, CORD).
"""
# Input type checking for clearer error
@@ -684,14 +689,14 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
truncates sequences if overflowing while taking into account the special tokens and manages a moving window
(with user defined stride) for overflowing tokens.
- Word-level :obj:`boxes` are turned into token-level :obj:`bbox`. If provided, word-level :obj:`word_labels` are
- turned into token-level :obj:`labels`. The word label is used for the first token of the word, while remaining
+ Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are
+ turned into token-level `labels`. The word label is used for the first token of the word, while remaining
tokens are labeled with -100, such that they will be ignored by the loss function.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
- text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+ text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
@@ -868,41 +873,41 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
Truncates a sequence pair in-place following the strategy.
Args:
- ids (:obj:`List[int]`):
- Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
- and ``convert_tokens_to_ids`` methods.
- token_boxes (:obj:`List[List[int]]`):
+ ids (`List[int]`):
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize`
+ and `convert_tokens_to_ids` methods.
+ token_boxes (`List[List[int]]`):
Bounding boxes of the first sequence.
- pair_ids (:obj:`List[int]`, `optional`):
- Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
- and ``convert_tokens_to_ids`` methods.
- pair_token_boxes (:obj:`List[List[int]]`, `optional`):
+ pair_ids (`List[int]`, *optional*):
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+ and `convert_tokens_to_ids` methods.
+ pair_token_boxes (`List[List[int]]`, *optional*):
Bounding boxes of the second sequence.
- labels (:obj:`List[int]`, `optional`):
+ labels (`List[int]`, *optional*):
Labels of the first sequence (for token classification tasks).
- num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
+ num_tokens_to_remove (`int`, *optional*, defaults to 0):
Number of tokens to remove using the truncation strategy.
- truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
The strategy to follow for truncation. Can be:
- * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will
truncate token by token, removing a token from the longest sequence in the pair if a pair of
sequences (or a batch of pairs) is provided.
- * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+ - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
- stride (:obj:`int`, `optional`, defaults to 0):
+ stride (`int`, *optional*, defaults to 0):
If set to a positive number, the overflowing tokens returned will contain some tokens from the main
sequence returned. The value of this argument defines the number of additional tokens.
Returns:
- :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
+ `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the
list of overflowing tokens.
"""
if num_tokens_to_remove <= 0:
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index 4b9170250f..8c17828de9 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -52,57 +52,62 @@ logger = logging.get_logger(__name__)
class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
- :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
- `__.
+ Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+ [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ bos_token (`str`, *optional*, defaults to `""`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `""`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `""`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `""`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `""`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `""`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `""`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+ cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [CLS] token.
- sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
+ sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
The bounding box to use for the special [SEP] token.
- pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+ pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
The bounding box to use for the special [PAD] token.
- pad_token_label (:obj:`int`, `optional`, defaults to -100):
- The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
+ pad_token_label (`int`, *optional*, defaults to -100):
+ The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
CrossEntropyLoss.
- only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ only_label_first_subword (`bool`, *optional*, defaults to `True`):
Whether or not to only label the first subword, in case word labels are provided.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
"""
@@ -189,16 +194,16 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
sequences with word-level normalized bounding boxes and optional labels.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
(words of a single example or questions of a batch of examples) or a list of list of strings (batch of
words).
- text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+ text_pair (`List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
(pretokenized string).
- boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+ boxes (`List[List[int]]`, `List[List[List[int]]]`):
Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
- word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+ word_labels (`List[int]`, `List[List[int]]`, *optional*):
Word-level integer labels (for token classification tasks such as FUNSD, CORD).
"""
# Input type checking for clearer error
@@ -630,17 +635,17 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:
- - single sequence: ``<s> X </s>``
- - pair of sequences: ``<s> A </s></s> B </s>``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -657,13 +662,13 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py
index e30c3e04c4..e6b617cc24 100644
--- a/src/transformers/models/led/configuration_led.py
+++ b/src/transformers/models/led/configuration_led.py
@@ -30,60 +30,63 @@ LED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LEDConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.LEDModel`. It is used to
+ This is the configuration class to store the configuration of a [`LEDModel`]. It is used to
instantiate an LED model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the LED `allenai/led-base-16384
- `__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the LED [allenai/led-base-16384](https://huggingface.co/allenai/led-base-16384) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the LED model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.LEDModel` or :class:`~transformers.TFLEDModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `input_ids` passed when calling [`LEDModel`] or [`TFLEDModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_encoder_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
+ max_encoder_position_embeddings (`int`, *optional*, defaults to 16384):
The maximum sequence length that the encoder might ever be used with.
- max_decoder_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
+ max_decoder_position_embeddings (`int`, *optional*, defaults to 16384):
The maximum sequence length that the decoder might ever be used with.
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+ for more details.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
- Example::
+ Example:
+
+ ```python
+
+ ```
>>> from transformers import LEDModel, LEDConfig
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index b0fd2e4ed7..1610b67ada 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -340,18 +340,21 @@ class LEDEncoderSelfAttention(nn.Module):
"""
shift every row 1 step right, converting columns into diagonals.
- Example::
+ Example:
- chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
- -1.8348, 0.7672, 0.2986, 0.0285,
- -0.7584, 0.4206, -0.0405, 0.1599,
- 2.0514, -1.1600, 0.5372, 0.2629 ]
- window_overlap = num_rows = 4
- (pad & diagonalize) =>
- [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
- 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
- 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
- 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+ ```python
+ chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
+ -1.8348, 0.7672, 0.2986, 0.0285,
+ -0.7584, 0.4206, -0.0405, 0.1599,
+ 2.0514, -1.1600, 0.5372, 0.2629 ]
+ window_overlap = num_rows = 4
+
+ (pad & diagonalize) =>
+ [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
+ 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
+ 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
+ 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+ ```
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
chunked_hidden_states = nn.functional.pad(
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index b12c1d0786..a0d5dc503d 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -607,18 +607,21 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
"""
shift every row 1 step right, converting columns into diagonals.
- Example::
+ Example:
- chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
- -1.8348, 0.7672, 0.2986, 0.0285,
- -0.7584, 0.4206, -0.0405, 0.1599,
- 2.0514, -1.1600, 0.5372, 0.2629 ]
- window_overlap = num_rows = 4
- (pad & diagonalize) =>
- [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
- 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
- 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
- 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+ ```python
+ chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
+ -1.8348, 0.7672, 0.2986, 0.0285,
+ -0.7584, 0.4206, -0.0405, 0.1599,
+ 2.0514, -1.1600, 0.5372, 0.2629 ]
+ window_overlap = num_rows = 4
+
+ (pad & diagonalize) =>
+ [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
+ 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
+ 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
+ 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+ ```
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
@@ -2368,19 +2371,20 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
"""
Returns:
- Examples::
+ Examples:
- >>> from transformers import LEDTokenizer, TFLEDForConditionalGeneration
- >>> import tensorflow as tf
- >>> mname = 'allenai/led-base-16384'
- >>> tokenizer = LEDTokenizer.from_pretrained(mname)
- >>> TXT = "My friends are but they eat too many carbs."
- >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
- >>> batch = tokenizer([TXT], return_tensors='tf')
- >>> logits = model(inputs=batch.input_ids).logits
- >>> probs = tf.nn.softmax(logits[0])
- >>> # probs[5] is associated with the mask token
- """
+ ```python
+ >>> from transformers import LEDTokenizer, TFLEDForConditionalGeneration
+ >>> import tensorflow as tf
+ >>> mname = 'allenai/led-base-16384'
+ >>> tokenizer = LEDTokenizer.from_pretrained(mname)
+ >>> TXT = "My friends are but they eat too many carbs."
+ >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
+ >>> batch = tokenizer([TXT], return_tensors='tf')
+ >>> logits = model(inputs=batch.input_ids).logits
+ >>> probs = tf.nn.softmax(logits[0])
+ >>> # probs[5] is associated with the mask token
+ ```"""
inputs = input_processing(
func=self.call,
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
index 3facfaa515..eca8098817 100644
--- a/src/transformers/models/led/tokenization_led.py
+++ b/src/transformers/models/led/tokenization_led.py
@@ -40,10 +40,10 @@ class LEDTokenizer(BartTokenizer):
"""
Construct a LED tokenizer.
- :class:`~transformers.LEDTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end
+ [`LEDTokenizer`] is identical to [`BartTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
index a6b681c4df..b815c806f3 100644
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ b/src/transformers/models/led/tokenization_led_fast.py
@@ -39,12 +39,12 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class LEDTokenizerFast(BartTokenizerFast):
r"""
- Construct a "fast" LED tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" LED tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.LEDTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs
+ [`LEDTokenizerFast`] is identical to [`BartTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/longformer/configuration_longformer.py b/src/transformers/models/longformer/configuration_longformer.py
index 3c72fc2763..f10f4a4cd3 100644
--- a/src/transformers/models/longformer/configuration_longformer.py
+++ b/src/transformers/models/longformer/configuration_longformer.py
@@ -34,37 +34,37 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LongformerConfig(RobertaConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel` or a
- :class:`~transformers.TFLongformerModel`. It is used to instantiate a Longformer model according to the specified
+ This is the configuration class to store the configuration of a [`LongformerModel`] or a
+ [`TFLongformerModel`]. It is used to instantiate a Longformer model according to the specified
arguments, defining the model architecture.
- This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`. It is used
+ This is the configuration class to store the configuration of a [`LongformerModel`]. It is used
to instantiate an Longformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
- `roberta-base `__ architecture with a sequence length 4,096.
+ [roberta-base](https://huggingface.co/roberta-base) architecture with a sequence length 4,096.
- The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`. It reuses
+ The [`LongformerConfig`] class directly inherits [`RobertaConfig`]. It reuses
the same defaults. Please check the parent class for more information.
Args:
- attention_window (:obj:`int` or :obj:`List[int]`, `optional`, defaults to 512):
- Size of an attention window around each token. If an :obj:`int`, use the same size for all layers. To
- specify a different window size for each layer, use a :obj:`List[int]` where ``len(attention_window) ==
- num_hidden_layers``.
+ attention_window (`int` or `List[int]`, *optional*, defaults to 512):
+ Size of an attention window around each token. If an `int`, use the same size for all layers. To
+ specify a different window size for each layer, use a `List[int]` where `len(attention_window) == num_hidden_layers`.
- Example::
+ Example:
- >>> from transformers import LongformerConfig, LongformerModel
+ ```python
+ >>> from transformers import LongformerConfig, LongformerModel
- >>> # Initializing a Longformer configuration
- >>> configuration = LongformerConfig()
+ >>> # Initializing a Longformer configuration
+ >>> configuration = LongformerConfig()
- >>> # Initializing a model from the configuration
- >>> model = LongformerModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = LongformerModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "longformer"
def __init__(self, attention_window: Union[List[int], int] = 512, sep_token_id: int = 2, **kwargs):
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 6524f42d80..056a85450d 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -709,18 +709,21 @@ class LongformerSelfAttention(nn.Module):
"""
shift every row 1 step right, converting columns into diagonals.
- Example::
+ Example:
- chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
- -1.8348, 0.7672, 0.2986, 0.0285,
- -0.7584, 0.4206, -0.0405, 0.1599,
- 2.0514, -1.1600, 0.5372, 0.2629 ]
- window_overlap = num_rows = 4
- (pad & diagonalize) =>
- [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
- 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
- 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
- 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+ ```python
+ chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
+ -1.8348, 0.7672, 0.2986, 0.0285,
+ -0.7584, 0.4206, -0.0405, 0.1599,
+ 2.0514, -1.1600, 0.5372, 0.2629 ]
+ window_overlap = num_rows = 4
+ ```
+
+ (pad & diagonalize) =>
+ [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
+ 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
+ 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
+ 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
chunked_hidden_states = nn.functional.pad(
@@ -1584,28 +1587,29 @@ class LongformerModel(LongformerPreTrainedModel):
Returns:
- Examples::
+ Examples:
- >>> import torch
- >>> from transformers import LongformerModel, LongformerTokenizer
+ ```python
+ >>> import torch
+ >>> from transformers import LongformerModel, LongformerTokenizer
- >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
- >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+ >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
+ >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
- >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
- >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
+ >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
+ >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
- >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
- >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens
- >>> global_attention_mask[:, [1, 4, 21,]] = 1 # Set global attention to random tokens for the sake of this example
- ... # Usually, set global attention based on the task. For example,
- ... # classification: the <s> token
- ... # QA: question tokens
- ... # LM: potentially on the beginning of sentences and paragraphs
- >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
- >>> sequence_output = outputs.last_hidden_state
- >>> pooled_output = outputs.pooler_output
- """
+ >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
+ >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens
+ >>> global_attention_mask[:, [1, 4, 21,]] = 1 # Set global attention to random tokens for the sake of this example
+ ... # Usually, set global attention based on the task. For example,
+ ... # classification: the <s> token
+ ... # QA: question tokens
+ ... # LM: potentially on the beginning of sentences and paragraphs
+ >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
+ >>> sequence_output = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 19d354248b..0137d545da 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -1121,18 +1121,21 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
"""
shift every row 1 step right, converting columns into diagonals.
- Example::
+ Example:
- chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
- -1.8348, 0.7672, 0.2986, 0.0285,
- -0.7584, 0.4206, -0.0405, 0.1599,
- 2.0514, -1.1600, 0.5372, 0.2629 ]
- window_overlap = num_rows = 4
- (pad & diagonalize) =>
- [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
- 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
- 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
- 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+ ```python
+ chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
+ -1.8348, 0.7672, 0.2986, 0.0285,
+ -0.7584, 0.4206, -0.0405, 0.1599,
+ 2.0514, -1.1600, 0.5372, 0.2629 ]
+ window_overlap = num_rows = 4
+ ```
+
+ (pad & diagonalize) =>
+ [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
+ 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
+ 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
+ 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py
index d841b4147c..bba0cbd7c5 100644
--- a/src/transformers/models/longformer/tokenization_longformer.py
+++ b/src/transformers/models/longformer/tokenization_longformer.py
@@ -52,7 +52,7 @@ class LongformerTokenizer(RobertaTokenizer):
r"""
Construct a Longformer tokenizer.
- :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the
+ [`LongformerTokenizer`] is identical to [`RobertaTokenizer`]. Refer to the
superclass for usage examples and documentation concerning parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py
index a42346fcd7..145e90b544 100644
--- a/src/transformers/models/longformer/tokenization_longformer_fast.py
+++ b/src/transformers/models/longformer/tokenization_longformer_fast.py
@@ -58,9 +58,9 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class LongformerTokenizerFast(RobertaTokenizerFast):
r"""
- Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" Longformer tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
+ [`LongformerTokenizerFast`] is identical to [`RobertaTokenizerFast`]. Refer
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py
index ba6dc49643..0c39057042 100644
--- a/src/transformers/models/luke/configuration_luke.py
+++ b/src/transformers/models/luke/configuration_luke.py
@@ -28,64 +28,64 @@ LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LukeConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.LukeModel`. It is used to
+ This is the configuration class to store the configuration of a [`LukeModel`]. It is used to
instantiate a LUKE model according to the specified arguments, defining the model architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the LUKE model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.LukeModel`.
- entity_vocab_size (:obj:`int`, `optional`, defaults to 500000):
+ `input_ids` passed when calling [`LukeModel`].
+ entity_vocab_size (`int`, *optional*, defaults to 500000):
Entity vocabulary size of the LUKE model. Defines the number of different entities that can be represented
- by the :obj:`entity_ids` passed when calling :class:`~transformers.LukeModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ by the `entity_ids` passed when calling [`LukeModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- entity_emb_size (:obj:`int`, `optional`, defaults to 256):
+ entity_emb_size (`int`, *optional*, defaults to 256):
The number of dimensions of the entity embedding.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.LukeModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`LukeModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- use_entity_aware_attention (:obj:`bool`, defaults to :obj:`True`):
- Whether or not the model should use the entity-aware self-attention mechanism proposed in `LUKE: Deep
- Contextualized Entity Representations with Entity-aware Self-attention (Yamada et al.)
- <https://arxiv.org/abs/2010.01057>`__.
+ use_entity_aware_attention (`bool`, defaults to `True`):
+ Whether or not the model should use the entity-aware self-attention mechanism proposed in [LUKE: Deep
+ Contextualized Entity Representations with Entity-aware Self-attention (Yamada et al.)](https://arxiv.org/abs/2010.01057).
- Examples::
+ Examples:
- >>> from transformers import LukeConfig, LukeModel
+ ```python
+ >>> from transformers import LukeConfig, LukeModel
- >>> # Initializing a LUKE configuration
- >>> configuration = LukeConfig()
+ >>> # Initializing a LUKE configuration
+ >>> configuration = LukeConfig()
- >>> # Initializing a model from the configuration
- >>> model = LukeModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = LukeModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "luke"
def __init__(
diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py
index 468093f24a..1a73f73a38 100644
--- a/src/transformers/models/luke/modeling_luke.py
+++ b/src/transformers/models/luke/modeling_luke.py
@@ -924,32 +924,33 @@ class LukeModel(LukePreTrainedModel):
Returns:
- Examples::
+ Examples:
- >>> from transformers import LukeTokenizer, LukeModel
+ ```python
+ >>> from transformers import LukeTokenizer, LukeModel
- >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
- >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
+ >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
+ >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
- # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"
- >>> text = "Beyoncé lives in Los Angeles."
- >>> entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé"
+ # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"
+ >>> text = "Beyoncé lives in Los Angeles."
+ >>> entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé"
- >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
- >>> outputs = model(**encoding)
- >>> word_last_hidden_state = outputs.last_hidden_state
- >>> entity_last_hidden_state = outputs.entity_last_hidden_state
+ >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+ >>> outputs = model(**encoding)
+ >>> word_last_hidden_state = outputs.last_hidden_state
+ >>> entity_last_hidden_state = outputs.entity_last_hidden_state
- # Input Wikipedia entities to obtain enriched contextualized representations of word tokens
- >>> text = "Beyoncé lives in Los Angeles."
- >>> entities = ["Beyoncé", "Los Angeles"] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
- >>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+ # Input Wikipedia entities to obtain enriched contextualized representations of word tokens
+ >>> text = "Beyoncé lives in Los Angeles."
+ >>> entities = ["Beyoncé", "Los Angeles"] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+ >>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
- >>> encoding = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
- >>> outputs = model(**encoding)
- >>> word_last_hidden_state = outputs.last_hidden_state
- >>> entity_last_hidden_state = outputs.entity_last_hidden_state
- """
+ >>> encoding = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+ >>> outputs = model(**encoding)
+ >>> word_last_hidden_state = outputs.last_hidden_state
+ >>> entity_last_hidden_state = outputs.entity_last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 785fdf2233..374819a130 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -74,79 +74,79 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
}
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- return_token_type_ids (:obj:`bool`, `optional`):
+ return_token_type_ids (`bool`, *optional*):
Whether to return token type IDs. If left to the default, will return the token type IDs according to
- the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+ the specific tokenizer's default, defined by the `return_outputs` attribute.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- return_attention_mask (:obj:`bool`, `optional`):
+ [What are token type IDs?](../glossary#token-type-ids)
+ return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+ to the specific tokenizer's default, defined by the `return_outputs` attribute.
- `What are attention masks? <../glossary.html#attention-mask>`__
- return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ [What are attention masks?](../glossary#attention-mask)
+ return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
- of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is
+ of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is
raised instead of returning overflowing tokens.
- return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
Whether or not to return special tokens mask information.
- return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to return :obj:`(char_start, char_end)` for each token.
+ return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+ Whether or not to return `(char_start, char_end)` for each token.
This is only available on fast tokenizers inheriting from
- :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
- :obj:`NotImplementedError`.
- return_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ [`PreTrainedTokenizerFast`], if using Python's tokenizer, this method will raise
+ `NotImplementedError`.
+ return_length (`bool`, *optional*, defaults to `False`):
Whether or not to return the lengths of the encoded inputs.
- verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
- **kwargs: passed to the :obj:`self.tokenize()` method
+ **kwargs: passed to the `self.tokenize()` method
- Return:
- :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+ Return:
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
- or if `"token_type_ids"` is in :obj:`self.model_input_names`).
+ - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True`
+ or if *"token_type_ids"* is in `self.model_input_names`).
- `What are token type IDs? <../glossary.html#token-type-ids>`__
+ [What are token type IDs?](../glossary#token-type-ids)
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
- :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- **entity_ids** -- List of entity ids to be fed to a model.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- **entity_position_ids** -- List of entity positions in the input sequence to be fed to a model.
- **entity_token_type_ids** -- List of entity token type ids to be fed to a model (when
- :obj:`return_token_type_ids=True` or if `"entity_token_type_ids"` is in :obj:`self.model_input_names`).
+ `return_token_type_ids=True` or if *"entity_token_type_ids"* is in `self.model_input_names`).
- `What are token type IDs? <../glossary.html#token-type-ids>`__
+ [What are token type IDs?](../glossary#token-type-ids)
- **entity_attention_mask** -- List of indices specifying which entities should be attended to by the model
- (when :obj:`return_attention_mask=True` or if `"entity_attention_mask"` is in
- :obj:`self.model_input_names`).
+ (when `return_attention_mask=True` or if *"entity_attention_mask"* is in
+ `self.model_input_names`).
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- **entity_start_positions** -- List of the start positions of entities in the word token sequence (when
- :obj:`task="entity_span_classification"`).
+ `task="entity_span_classification"`).
- **entity_end_positions** -- List of the end positions of entities in the word token sequence (when
- :obj:`task="entity_span_classification"`).
- - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
- :obj:`return_overflowing_tokens=True`).
- - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
- :obj:`return_overflowing_tokens=True`).
+ `task="entity_span_classification"`).
+ - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+ `return_overflowing_tokens=True`).
+ - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+ `return_overflowing_tokens=True`).
- **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
- regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
- - **length** -- The length of the inputs (when :obj:`return_length=True`)
+ regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+ - **length** -- The length of the inputs (when `return_length=True`)
"""
@@ -155,33 +155,33 @@ class LukeTokenizer(RobertaTokenizer):
r"""
Construct a LUKE tokenizer.
- This tokenizer inherits from :class:`~transformers.RobertaTokenizer` which contains most of the main methods. Users
+ This tokenizer inherits from [`RobertaTokenizer`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods. Compared to
- :class:`~transformers.RobertaTokenizer`, :class:`~transformers.LukeTokenizer` also creates entity sequences, namely
- :obj:`entity_ids`, :obj:`entity_attention_mask`, :obj:`entity_token_type_ids`, and :obj:`entity_position_ids` to be
+ [`RobertaTokenizer`], [`LukeTokenizer`] also creates entity sequences, namely
+ `entity_ids`, `entity_attention_mask`, `entity_token_type_ids`, and `entity_position_ids` to be
used by the LUKE model.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- entity_vocab_file (:obj:`str`):
+ entity_vocab_file (`str`):
Path to the entity vocabulary file.
- task (:obj:`str`, `optional`):
- Task for which you want to prepare sequences. One of :obj:`"entity_classification"`,
- :obj:`"entity_pair_classification"`, or :obj:`"entity_span_classification"`. If you specify this argument,
+ task (`str`, *optional*):
+ Task for which you want to prepare sequences. One of `"entity_classification"`,
+ `"entity_pair_classification"`, or `"entity_span_classification"`. If you specify this argument,
the entity sequence is automatically created based on the given entity span(s).
- max_entity_length (:obj:`int`, `optional`, defaults to 32):
- The maximum length of :obj:`entity_ids`.
- max_mention_length (:obj:`int`, `optional`, defaults to 30):
+ max_entity_length (`int`, *optional*, defaults to 32):
+ The maximum length of `entity_ids`.
+ max_mention_length (`int`, *optional*, defaults to 30):
The maximum number of tokens inside an entity span.
- entity_token_1 (:obj:`str`, `optional`, defaults to :obj:`<ent>`):
+ entity_token_1 (`str`, *optional*, defaults to `<ent>`):
The special token used to represent an entity span in a word token sequence. This token is only used when
- ``task`` is set to :obj:`"entity_classification"` or :obj:`"entity_pair_classification"`.
- entity_token_2 (:obj:`str`, `optional`, defaults to :obj:`<ent2>`):
The special token used to represent an entity span in a word token sequence. This token is only used when
+ entity_token_2 (`str`, *optional*, defaults to `<ent2>`):
The special token used to represent an entity span in a word token sequence. This token is only used when
- ``task`` is set to :obj:`"entity_pair_classification"`.
+ `task` is set to `"entity_pair_classification"`.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -275,39 +275,39 @@ class LukeTokenizer(RobertaTokenizer):
sequences, depending on the task you want to prepare them for.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
tokenizer does not support tokenization based on pretokenized strings.
- text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text_pair (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
tokenizer does not support tokenization based on pretokenized strings.
- entity_spans (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+ entity_spans (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
with two integers denoting character-based start and end positions of entities. If you specify
- :obj:`"entity_classification"` or :obj:`"entity_pair_classification"` as the ``task`` argument in the
- constructor, the length of each sequence must be 1 or 2, respectively. If you specify ``entities``, the
- length of each sequence must be equal to the length of each sequence of ``entities``.
- entity_spans_pair (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+ `"entity_classification"` or `"entity_pair_classification"` as the `task` argument in the
+ constructor, the length of each sequence must be 1 or 2, respectively. If you specify `entities`, the
+ length of each sequence must be equal to the length of each sequence of `entities`.
+ entity_spans_pair (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
with two integers denoting character-based start and end positions of entities. If you specify the
- ``task`` argument in the constructor, this argument is ignored. If you specify ``entities_pair``, the
- length of each sequence must be equal to the length of each sequence of ``entities_pair``.
- entities (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+ `task` argument in the constructor, this argument is ignored. If you specify `entities_pair`, the
+ length of each sequence must be equal to the length of each sequence of `entities_pair`.
+ entities (`List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
- Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
- of each sequence must be equal to the length of each sequence of ``entity_spans``. If you specify
- ``entity_spans`` without specifying this argument, the entity sequence or the batch of entity sequences
+ Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+ of each sequence must be equal to the length of each sequence of `entity_spans`. If you specify
+ `entity_spans` without specifying this argument, the entity sequence or the batch of entity sequences
is automatically constructed by filling it with the [MASK] entity.
- entities_pair (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+ entities_pair (`List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
- Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
- of each sequence must be equal to the length of each sequence of ``entity_spans_pair``. If you specify
- ``entity_spans_pair`` without specifying this argument, the entity sequence or the batch of entity
+ Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+ of each sequence must be equal to the length of each sequence of `entity_spans_pair`. If you specify
+ `entity_spans_pair` without specifying this argument, the entity sequence or the batch of entity
sequences is automatically constructed by filling it with the [MASK] entity.
- max_entity_length (:obj:`int`, `optional`):
- The maximum length of :obj:`entity_ids`.
+ max_entity_length (`int`, *optional*):
+ The maximum length of `entity_ids`.
"""
# Input type checking for clearer error
is_valid_single_text = isinstance(text, str)
@@ -865,24 +865,24 @@ class LukeTokenizer(RobertaTokenizer):
Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
while taking into account the special tokens and manages a moving window (with user defined stride) for
- overflowing tokens. Please Note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
- or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
+ overflowing tokens. Please Note, for *pair_ids* different than *None* and *truncation_strategy = longest_first*
+ or *True*, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
error.
Args:
- ids (:obj:`List[int]`):
+ ids (`List[int]`):
Tokenized input ids of the first sequence.
- pair_ids (:obj:`List[int]`, `optional`):
+ pair_ids (`List[int]`, *optional*):
Tokenized input ids of the second sequence.
- entity_ids (:obj:`List[int]`, `optional`):
+ entity_ids (`List[int]`, *optional*):
Entity ids of the first sequence.
- pair_entity_ids (:obj:`List[int]`, `optional`):
+ pair_entity_ids (`List[int]`, *optional*):
Entity ids of the second sequence.
- entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+ entity_token_spans (`List[Tuple[int, int]]`, *optional*):
Entity spans of the first sequence.
- pair_entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+ pair_entity_token_spans (`List[Tuple[int, int]]`, *optional*):
Entity spans of the second sequence.
- max_entity_length (:obj:`int`, `optional`):
+ max_entity_length (`int`, *optional*):
The maximum length of the entity sequence.
"""
@@ -1083,46 +1083,45 @@ class LukeTokenizer(RobertaTokenizer):
"""
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with
- ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) .. note:: If the
- ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
- will use the same type unless you provide a different tensor type with ``return_tensors``. In the case of
+ `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`). Note: If the
+ `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
+ will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
PyTorch tensors, you will lose the specific device of your tensors however.
Args:
- encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
- Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
- List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
- List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
- well as in a PyTorch Dataloader collate function. Instead of :obj:`List[int]` you can have tensors
+ encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`):
+ Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], `Dict[str,
+ List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
+ well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors
(numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
- max_entity_length (:obj:`int`, `optional`):
+ max_entity_length (`int`, *optional*):
The maximum length of the entity sequence.
- pad_to_multiple_of (:obj:`int`, `optional`):
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
- return_attention_mask (:obj:`bool`, `optional`):
+ return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are
- attention masks? <../glossary.html#attention-mask>`__
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are
+ attention masks?](../glossary#attention-mask)
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
- verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
"""
# If we have a list of dicts, let's convert it in a dict of lists
diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py
index e4d9474941..a0fabca535 100644
--- a/src/transformers/models/lxmert/configuration_lxmert.py
+++ b/src/transformers/models/lxmert/configuration_lxmert.py
@@ -28,86 +28,86 @@ LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LxmertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.LxmertModel` or a
- :class:`~transformers.TFLxmertModel`. It is used to instantiate a LXMERT model according to the specified
+ This is the configuration class to store the configuration of a [`LxmertModel`] or a
+ [`TFLxmertModel`]. It is used to instantiate a LXMERT model according to the specified
arguments, defining the model architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the LXMERT model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.LxmertModel` or
- :class:`~transformers.TFLxmertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `inputs_ids` passed when calling [`LxmertModel`] or
+ [`TFLxmertModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- r_layers (:obj:`int`, `optional`, defaults to 5):
+ r_layers (`int`, *optional*, defaults to 5):
Number of hidden layers in the Transformer visual encoder.
- l_layers (:obj:`int`, `optional`, defaults to 9):
+ l_layers (`int`, *optional*, defaults to 9):
Number of hidden layers in the Transformer language encoder.
- x_layers (:obj:`int`, `optional`, defaults to 5):
+ x_layers (`int`, *optional*, defaults to 5):
Number of hidden layers in the Transformer cross modality encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 5):
+ num_attention_heads (`int`, *optional*, defaults to 5):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed into [`BertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- visual_feat_dim (:obj:`int`, `optional`, defaults to 2048):
+ visual_feat_dim (`int`, *optional*, defaults to 2048):
This represents the last dimension of the pooled-object features used as input for the model, representing
the size of each object feature itself.
- visual_pos_dim (:obj:`int`, `optional`, defaults to 4):
+ visual_pos_dim (`int`, *optional*, defaults to 4):
This represents the number of spacial features that are mixed into the visual features. The default is set
to 4 because most commonly this will represent the location of a bounding box. i.e., (x, y, width, height)
- visual_loss_normalizer (:obj:`float`, `optional`, defaults to 1/15):
+ visual_loss_normalizer (`float`, *optional*, defaults to 1/15):
This represents the scaling factor in which each visual loss is multiplied by if during pretraining, one
decided to train with multiple vision-based loss objectives.
- num_qa_labels (:obj:`int`, `optional`, defaults to 9500):
+ num_qa_labels (`int`, *optional*, defaults to 9500):
This represents the total number of different question answering (QA) labels there are. If using more than
one dataset with QA, the user will need to account for the total number of labels that all of the datasets
have in total.
- num_object_labels (:obj:`int`, `optional`, defaults to 1600):
+ num_object_labels (`int`, *optional*, defaults to 1600):
This represents the total number of semantically unique objects that lxmert will be able to classify a
pooled-object feature as belonging too.
- num_attr_labels (:obj:`int`, `optional`, defaults to 400):
+ num_attr_labels (`int`, *optional*, defaults to 400):
This represents the total number of semantically unique attributes that lxmert will be able to classify a
pooled-object feature as possessing.
- task_matched (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ task_matched (`bool`, *optional*, defaults to `True`):
This task is used for sentence-image matching. If the sentence correctly describes the image the label will
be 1. If the sentence does not correctly describe the image, the label will be 0.
- task_mask_lm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ task_mask_lm (`bool`, *optional*, defaults to `True`):
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
objective.
- task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ task_obj_predict (`bool`, *optional*, defaults to `True`):
Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
- task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ task_qa (`bool`, *optional*, defaults to `True`):
Whether or not to add the question-answering loss to the objective
- visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ visual_obj_loss (`bool`, *optional*, defaults to `True`):
Whether or not to calculate the object-prediction loss objective
- visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ visual_attr_loss (`bool`, *optional*, defaults to `True`):
Whether or not to calculate the attribute-prediction loss objective
- visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ visual_feat_loss (`bool`, *optional*, defaults to `True`):
Whether or not to calculate the feature-regression loss objective
- output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not the model should return the attentions from the vision, language, and cross-modality layers
should be returned.
- output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not the model should return the hidden states from the vision, language, and cross-modality
layers should be returned.
"""
diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py
index 75f55e5607..5d4e97ad54 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert.py
@@ -37,10 +37,10 @@ class LxmertTokenizer(BertTokenizer):
r"""
Construct an LXMERT tokenizer.
- :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+ [`LxmertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
index 9f179fb319..08323d25f6 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
@@ -39,12 +39,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class LxmertTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" LXMERT tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`LxmertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py
index a4a0df749c..3651f51487 100644
--- a/src/transformers/models/m2m_100/configuration_m2m_100.py
+++ b/src/transformers/models/m2m_100/configuration_m2m_100.py
@@ -28,71 +28,71 @@ M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class M2M100Config(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.M2M100Model`. It is used to
+ This is the configuration class to store the configuration of a [`M2M100Model`]. It is used to
instantiate an M2M100 model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the M2M100 `m2m100_418M
- <https://huggingface.co/facebook/m2m100_418M>`__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the M2M100 [m2m100_418M](https://huggingface.co/facebook/m2m100_418M) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the M2M100 model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.M2M100Model` or
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `inputs_ids` passed when calling [`M2M100Model`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper <see https://arxiv.org/abs/1909.11556>`__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper <see https://arxiv.org/abs/1909.11556>`__ for more details.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- Example::
+ Example:
- >>> from transformers import M2M100Model, M2M100Config
+ ```python
+ >>> from transformers import M2M100Model, M2M100Config
- >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
- >>> configuration = M2M100Config()
+ >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
+ >>> configuration = M2M100Config()
- >>> # Initializing a model from the facebook/m2m100_418M style configuration
- >>> model = M2M100Model(configuration)
+ >>> # Initializing a model from the facebook/m2m100_418M style configuration
+ >>> model = M2M100Model(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "m2m_100"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py
index 88ce4bd44d..7cd7bc4003 100644
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -63,60 +63,60 @@ FAIRSEQ_LANGUAGE_CODES = {
class M2M100Tokenizer(PreTrainedTokenizer):
"""
- Construct an M2M100 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+ Construct an M2M100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- spm_file (:obj:`str`):
- Path to `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension)
+ spm_file (`str`):
+ Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension)
that contains the vocabulary.
- src_lang (:obj:`str`, `optional`):
+ src_lang (`str`, *optional*):
A string representing the source language.
- tgt_lang (:obj:`str`, `optional`):
+ tgt_lang (`str`, *optional*):
A string representing the target language.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- language_codes (:obj:`str`, `optional`, defaults to :obj:`"m2m100"`):
- What language codes to use. Should be one of :obj:`"m2m100"` or :obj:`"wmt21"`.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+ language_codes (`str`, *optional*, defaults to `"m2m100"`):
+ What language codes to use. Should be one of `"m2m100"` or `"wmt21"`.
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
- Examples::
+ Examples:
- >>> from transformers import M2M100Tokenizer
- >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M, src_lang="en", tgt_lang="ro")
- >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
- >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
- >>> model_inputs = tokenizer(src_text, return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
- >>> # model(**model_inputs, labels=labels) should work
- """
+ ```python
+ >>> from transformers import M2M100Tokenizer
+ >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
+ >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+ >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+ >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+ >>> # model(**model_inputs, labels=labels) should work
+ ```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -226,18 +226,18 @@ class M2M100Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -256,22 +256,22 @@ class M2M100Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
- adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
+ adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
- - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+ - `input_ids` (for encoder) `X [eos, src_lang_code]`
+ - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py
index 825c7d707a..a0be3c7723 100644
--- a/src/transformers/models/marian/configuration_marian.py
+++ b/src/transformers/models/marian/configuration_marian.py
@@ -28,77 +28,78 @@ MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MarianConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.MarianModel`. It is used to
+ This is the configuration class to store the configuration of a [`MarianModel`]. It is used to
instantiate an Marian model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Marian
- `Helsinki-NLP/opus-mt-en-de `__ architecture.
+ [Helsinki-NLP/opus-mt-en-de](https://huggingface.co/Helsinki-NLP/opus-mt-en-de) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.MarianModel` or
- :class:`~transformers.TFMarianModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `inputs_ids` passed when calling [`MarianModel`] or
+ [`TFMarianModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
- forced_eos_token_id (:obj:`int`, `optional`, defaults to 0):
- The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
- :obj:`eos_token_id`.
+ forced_eos_token_id (`int`, *optional*, defaults to 0):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
- Examples::
+ Examples:
- >>> from transformers import MarianModel, MarianConfig
+ ```python
+ >>> from transformers import MarianModel, MarianConfig
- >>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration
- >>> configuration = MarianConfig()
+ >>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration
+ >>> configuration = MarianConfig()
- >>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration
- >>> model = MarianModel(configuration)
+ >>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration
+ >>> model = MarianModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "marian"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py
index 5d58f03877..388d0cafd7 100644
--- a/src/transformers/models/marian/modeling_flax_marian.py
+++ b/src/transformers/models/marian/modeling_flax_marian.py
@@ -975,17 +975,18 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+ ```python
+ >>> from transformers import MarianTokenizer, FlaxMarianMTModel
- >>> tokenizer = MarianTokenizer.from_pretrained('facebook/marian-large-cnn')
- >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
- """
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1041,23 +1042,24 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+ ```python
+ >>> from transformers import MarianTokenizer, FlaxMarianMTModel
- >>> tokenizer = MarianTokenizer.from_pretrained('facebook/marian-large-cnn')
- >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> last_decoder_hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1308,23 +1310,24 @@ class FlaxMarianMTModel(FlaxMarianPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+ ```python
+ >>> from transformers import MarianTokenizer, FlaxMarianMTModel
- >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1468,20 +1471,22 @@ class FlaxMarianMTModel(FlaxMarianPreTrainedModel):
FLAX_MARIAN_MT_DOCSTRING = """
Returns:
- Example::
+ Example:
- >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+ ```python
+ >>> from transformers import MarianTokenizer, FlaxMarianMTModel
- >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> input_ids = tokenizer(text, max_length=64, return_tensors='jax').input_ids
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> input_ids = tokenizer(text, max_length=64, return_tensors='jax').input_ids
- >>> sequences = model.generate(input_ids, max_length=64, num_beams=2).sequences
+ >>> sequences = model.generate(input_ids, max_length=64, num_beams=2).sequences
- >>> outputs = tokenizer.batch_decode(sequences, skip_special_tokens=True)
- >>> # should give `Meine Freunde sind cool, aber sie essen zu viele Kohlenhydrate.`
+ >>> outputs = tokenizer.batch_decode(sequences, skip_special_tokens=True)
+ >>> # should give *Meine Freunde sind cool, aber sie essen zu viele Kohlenhydrate.*
+ ```
"""
overwrite_call_docstring(
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index 19903612ce..b7752e938d 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -523,23 +523,25 @@ MARIAN_START_DOCSTRING = r"""
MARIAN_GENERATION_EXAMPLE = r"""
Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
- Available models are listed `here `__.
+ Available models are listed [here](https://huggingface.co/models?search=Helsinki-NLP).
- Examples::
+ Examples:
- >>> from transformers import MarianTokenizer, MarianMTModel
- >>> from typing import List
- >>> src = 'fr' # source language
- >>> trg = 'en' # target language
- >>> sample_text = "où est l'arrêt de bus ?"
- >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+ ```python
+ >>> from transformers import MarianTokenizer, MarianMTModel
+ >>> from typing import List
+ >>> src = 'fr' # source language
+ >>> trg = 'en' # target language
+ >>> sample_text = "où est l'arrêt de bus ?"
+ >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
- >>> model = MarianMTModel.from_pretrained(model_name)
- >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
- >>> batch = tokenizer([sample_text], return_tensors="pt")
- >>> gen = model.generate(**batch)
- >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
- "Where is the bus stop ?"
+ >>> model = MarianMTModel.from_pretrained(model_name)
+ >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+ >>> batch = tokenizer([sample_text], return_tensors="pt")
+ >>> gen = model.generate(**batch)
+ >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
+ "Where is the bus stop ?"
+ ```
"""
MARIAN_INPUTS_DOCSTRING = r"""
@@ -1124,20 +1126,21 @@ class MarianModel(MarianPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MarianTokenizer, MarianModel
+ ```python
+ >>> from transformers import MarianTokenizer, MarianModel
- >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> model = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> model = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
- >>> decoder_input_ids = tokenizer(" Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
- ... return_tensors="pt", add_special_tokens=False).input_ids # Batch size 1
- >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+ >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
+ >>> decoder_input_ids = tokenizer(" Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
+ ... return_tensors="pt", add_special_tokens=False).input_ids # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
- >>> last_hidden_states = outputs.last_hidden_state
- """
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py
index d4bb501838..f8638c14a3 100644
--- a/src/transformers/models/marian/modeling_tf_marian.py
+++ b/src/transformers/models/marian/modeling_tf_marian.py
@@ -555,23 +555,25 @@ MARIAN_START_DOCSTRING = r"""
MARIAN_GENERATION_EXAMPLE = r"""
TF version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available
- models are listed `here `__.
+ models are listed [here](https://huggingface.co/models?search=Helsinki-NLP).
- Examples::
+ Examples:
- >>> from transformers import MarianTokenizer, TFMarianMTModel
- >>> from typing import List
- >>> src = 'fr' # source language
- >>> trg = 'en' # target language
- >>> sample_text = "où est l'arrêt de bus ?"
- >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+ ```python
+ >>> from transformers import MarianTokenizer, TFMarianMTModel
+ >>> from typing import List
+ >>> src = 'fr' # source language
+ >>> trg = 'en' # target language
+ >>> sample_text = "où est l'arrêt de bus ?"
+ >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
- >>> model = TFMarianMTModel.from_pretrained(model_name)
- >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
- >>> batch = tokenizer([sample_text], return_tensors="tf")
- >>> gen = model.generate(**batch)
- >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
- "Where is the bus stop ?"
+ >>> model = TFMarianMTModel.from_pretrained(model_name)
+ >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+ >>> batch = tokenizer([sample_text], return_tensors="tf")
+ >>> gen = model.generate(**batch)
+ >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
+ "Where is the bus stop ?"
+ ```
"""
MARIAN_INPUTS_DOCSTRING = r"""
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 828afd53b9..5022569b8d 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -55,61 +55,61 @@ PRETRAINED_INIT_CONFIGURATION = {}
class MarianTokenizer(PreTrainedTokenizer):
r"""
- Construct a Marian tokenizer. Based on `SentencePiece `__.
+ Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- source_spm (:obj:`str`):
- `SentencePiece `__ file (generally has a .spm extension) that
+ source_spm (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary for the source language.
- target_spm (:obj:`str`):
- `SentencePiece `__ file (generally has a .spm extension) that
+ target_spm (`str`):
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary for the target language.
- source_lang (:obj:`str`, `optional`):
+ source_lang (`str`, *optional*):
A string representing the source language.
- target_lang (:obj:`str`, `optional`):
+ target_lang (`str`, *optional*):
A string representing the target language.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `""`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ eos_token (`str`, *optional*, defaults to `""`):
The end of sequence token.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `""`):
The token used for padding, for example when batching sequences of different lengths.
- model_max_length (:obj:`int`, `optional`, defaults to 512):
+ model_max_length (`int`, *optional*, defaults to 512):
The maximum sentence length the model accepts.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`):
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["", ""]`):
Additional special tokens used by the tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
- Examples::
+ Examples:
- >>> from transformers import MarianTokenizer
- >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
- >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
- >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
- >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True)
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
- >>> inputs["labels"] = labels["input_ids"]
- # keys [input_ids, attention_mask, labels].
- >>> outputs = model(**inputs) should work
- """
+ ```python
+ >>> from transformers import MarianTokenizer
+ >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+ >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
+ >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
+ >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True)
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
+ >>> inputs["labels"] = labels["input_ids"]
+ >>> # keys [input_ids, attention_mask, labels].
+ >>> outputs = model(**inputs)  # should work
+ ```"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -202,20 +202,20 @@ class MarianTokenizer(PreTrainedTokenizer):
Convert a list of lists of token ids into a list of strings by calling decode.
Args:
- sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
- List of tokenized input ids. Can be obtained using the ``__call__`` method.
- skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
+ List of tokenized input ids. Can be obtained using the `__call__` method.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
- clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up the tokenization spaces.
- use_source_tokenizer (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_source_tokenizer (`bool`, *optional*, defaults to `False`):
Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
problems).
- kwargs (additional keyword arguments, `optional`):
+ kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
- :obj:`List[str]`: The list of decoded sentences.
+ `List[str]`: The list of decoded sentences.
"""
return super().batch_decode(sequences, **kwargs)
@@ -224,23 +224,23 @@ class MarianTokenizer(PreTrainedTokenizer):
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.
- Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
Args:
- token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
- List of tokenized input ids. Can be obtained using the ``__call__`` method.
- skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+ List of tokenized input ids. Can be obtained using the `__call__` method.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
- clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up the tokenization spaces.
- use_source_tokenizer (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_source_tokenizer (`bool`, *optional*, defaults to `False`):
Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
problems).
- kwargs (additional keyword arguments, `optional`):
+ kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
Returns:
- :obj:`str`: The decoded sentence.
+ `str`: The decoded sentence.
"""
return super().decode(token_ids, **kwargs)
diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py
index d1eb27c0e8..2e4769583f 100644
--- a/src/transformers/models/mbart/configuration_mbart.py
+++ b/src/transformers/models/mbart/configuration_mbart.py
@@ -32,77 +32,77 @@ MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MBartConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.MBartModel`. It is used to
+ This is the configuration class to store the configuration of a [`MBartModel`]. It is used to
instantiate an MBART model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the MBART `facebook/mbart-large-cc25
- `__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the MBART [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.MBartModel` or
- :class:`~transformers.TFMBartModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `inputs_ids` passed when calling [`MBartModel`] or
+ [`TFMBartModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the
+ [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+ scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
- forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
- The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
- :obj:`eos_token_id`.
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+ `eos_token_id`.
- Example::
+ Example:
- >>> from transformers import MBartModel, MBartConfig
+ ```python
+ >>> from transformers import MBartModel, MBartConfig
- >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
- >>> configuration = MBartConfig()
+ >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
+ >>> configuration = MBartConfig()
- >>> # Initializing a model from the facebook/mbart-large-cc25 style configuration
- >>> model = MBartModel(configuration)
+ >>> # Initializing a model from the facebook/mbart-large-cc25 style configuration
+ >>> model = MBartModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "mbart"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py
index 88d8b76b69..d6a425ff3c 100644
--- a/src/transformers/models/mbart/modeling_flax_mbart.py
+++ b/src/transformers/models/mbart/modeling_flax_mbart.py
@@ -1041,17 +1041,18 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+ ```python
+ >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
- >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
- >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+ >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
+ >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
- """
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1106,23 +1107,24 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+ ```python
+ >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
- >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
- >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+ >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
+ >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> last_decoder_hidden_states = outputs.last_hidden_state
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1372,23 +1374,24 @@ class FlaxMBartForConditionalGeneration(FlaxMBartPreTrainedModel):
r"""
Returns:
- Example::
+ Example:
- >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+ ```python
+ >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
- >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
- >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+ >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
+ >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
- >>> text = "My friends are cool but they eat too many carbs."
- >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
- >>> encoder_outputs = model.encode(**inputs)
+ >>> text = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+ >>> encoder_outputs = model.encode(**inputs)
- >>> decoder_start_token_id = model.config.decoder_start_token_id
- >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
- >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
- >>> logits = outputs.logits
- """
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+ >>> logits = outputs.logits
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index cf3bfab08f..ce33736f74 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -71,24 +71,25 @@ class MBartTokenizer(XLMRobertaTokenizer):
"""
Construct an MBART tokenizer.
- :class:`~transformers.MBartTokenizer` is a subclass of :class:`~transformers.XLMRobertaTokenizer`. Refer to
- superclass :class:`~transformers.XLMRobertaTokenizer` for usage examples and documentation concerning the
+ [`MBartTokenizer`] is a subclass of [`XLMRobertaTokenizer`]. Refer to
+ superclass [`XLMRobertaTokenizer`] for usage examples and documentation concerning the
initialization parameters and other methods.
- The tokenization method is `` `` for source language documents, and ``
+ The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
- Examples::
+ Examples:
- >>> from transformers import MBartTokenizer
- >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
- >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
- >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
- >>> inputs = tokenizer(example_english_phrase, return_tensors="pt)
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
- >>> inputs["labels"] = labels["input_ids"]
- """
+ ```python
+ >>> from transformers import MBartTokenizer
+ >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
+ >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+ >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+ >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
+ >>> inputs["labels"] = labels["input_ids"]
+ ```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -149,18 +150,18 @@ class MBartTokenizer(XLMRobertaTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -179,22 +180,22 @@ class MBartTokenizer(XLMRobertaTokenizer):
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
- adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
+ adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
- - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+ - `input_ids` (for encoder) `X [eos, src_lang_code]`
+ - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index b135ecba4c..bf578bb2c4 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -82,27 +82,27 @@ FAIRSEQ_LANGUAGE_CODES = [
class MBartTokenizerFast(XLMRobertaTokenizerFast):
"""
- Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE
- `__.
+ Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
- :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast`. Refer to
- superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning the
+ [`MBartTokenizerFast`] is a subclass of [`XLMRobertaTokenizerFast`]. Refer to
+ superclass [`XLMRobertaTokenizerFast`] for usage examples and documentation concerning the
initialization parameters and other methods.
- The tokenization method is `` `` for source language documents, and ``
+ The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
- Examples::
+ Examples:
- >>> from transformers import MBartTokenizerFast
- >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
- >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
- >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
- >>> inputs = tokenizer(example_english_phrase, return_tensors="pt)
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
- >>> inputs["labels"] = labels["input_ids"]
- """
+ ```python
+ >>> from transformers import MBartTokenizerFast
+ >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
+ >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+ >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+ >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
+ >>> inputs["labels"] = labels["input_ids"]
+ ```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -164,22 +164,22 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
- An MBART sequence has the following format, where ``X`` represents the sequence:
+ An MBART sequence has the following format, where `X` represents the sequence:
- - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+ - `input_ids` (for encoder) `X [eos, src_lang_code]`
+ - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py
index dbfd53a7fb..48f34cd9ac 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50.py
@@ -47,61 +47,61 @@ FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE",
class MBart50Tokenizer(PreTrainedTokenizer):
"""
- Construct a MBart50 tokenizer. Based on `SentencePiece `__.
+ Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- src_lang (:obj:`str`, `optional`):
+ src_lang (`str`, *optional*):
A string representing the source language.
- tgt_lang (:obj:`str`, `optional`):
+ tgt_lang (`str`, *optional*):
A string representing the target language.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
- Examples::
+ Examples:
- >>> from transformers import MBart50Tokenizer
- >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
- >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
- >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
- >>> model_inputs = tokenizer(src_text, return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
- >>> # model(**model_inputs, labels=labels) should work
- """
+ ```python
+ >>> from transformers import MBart50Tokenizer
+ >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
+ >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+ >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+ >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+ >>> # model(**model_inputs, labels=labels) should work
+ ```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -252,18 +252,18 @@ class MBart50Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -282,22 +282,22 @@ class MBart50Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
- adding special tokens. An MBART-50 sequence has the following format, where ``X`` represents the sequence:
+ adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence:
- - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]``
- - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]``
+ - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+ - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
index 93f93d2423..7b481f4362 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -56,48 +56,48 @@ FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE",
class MBart50TokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's `tokenizers` library). Based on `BPE
- `__.
+ Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's *tokenizers* library). Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- src_lang (:obj:`str`, `optional`):
+ src_lang (`str`, *optional*):
A string representing the source language.
- tgt_lang (:obj:`str`, `optional`):
+ tgt_lang (`str`, *optional*):
A string representing the target language.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- Examples::
+ Examples:
- >>> from transformers import MBart50TokenizerFast
- >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
- >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
- >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
- >>> model_inputs = tokenizer(src_text, return_tensors="pt")
- >>> with tokenizer.as_target_tokenizer():
- ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
- >>> # model(**model_inputs, labels=labels) should work
- """
+ ```python
+ >>> from transformers import MBart50TokenizerFast
+ >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
+ >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+ >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+ >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+ >>> with tokenizer.as_target_tokenizer():
+ ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+ >>> # model(**model_inputs, labels=labels) should work
+ ```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -172,22 +172,22 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
- An MBART-50 sequence has the following format, where ``X`` represents the sequence:
+ An MBART-50 sequence has the following format, where `X` represents the sequence:
- - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]``
- - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]``
+ - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+ - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
index d6e32cd496..0210f0466f 100644
--- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
@@ -27,68 +27,67 @@ MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MegatronBertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. It is
+ This is the configuration class to store the configuration of a [`MegatronBertModel`]. It is
used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT
- `megatron-bert-uncased-345m `__ architecture.
+ [megatron-bert-uncased-345m](https://huggingface.co/nvidia/megatron-bert-uncased-345m) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 29056):
+ vocab_size (`int`, *optional*, defaults to 29056):
Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented
- by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 1024):
+ by the `input_ids` passed when calling [`MegatronBertModel`].
+ hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+ num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 4096):
+ intermediate_size (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling
- :class:`~transformers.MegatronBertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling
+ [`MegatronBertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
- Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
- :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
- :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
- `__. For more information on :obj:`"relative_key_query"`, please refer to
- `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
- `__.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
- relevant if ``config.is_decoder=True``.
+ relevant if `config.is_decoder=True`.
- Examples::
+ Examples:
- >>> from transformers import MegatronBertModel, MegatronBertConfig
+ ```python
+ >>> from transformers import MegatronBertModel, MegatronBertConfig
- >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
- >>> configuration = MegatronBertConfig()
+ >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
+ >>> configuration = MegatronBertConfig()
- >>> # Initializing a model from the bert-base-uncased style configuration
- >>> model = MegatronBertModel(configuration)
+ >>> # Initializing a model from the bert-base-uncased style configuration
+ >>> model = MegatronBertModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "megatron-bert"
def __init__(
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index aa547737c7..06dc3f4430 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -72,159 +72,164 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
}
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- return_token_type_ids (:obj:`bool`, `optional`):
+ return_token_type_ids (`bool`, *optional*):
Whether to return token type IDs. If left to the default, will return the token type IDs according to
- the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+ the specific tokenizer's default, defined by the `return_outputs` attribute.
- `What are token type IDs? <../glossary.html#token-type-ids>`__
- return_attention_mask (:obj:`bool`, `optional`):
+ [What are token type IDs?](../glossary#token-type-ids)
+ return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+ to the specific tokenizer's default, defined by the `return_outputs` attribute.
- `What are attention masks? <../glossary.html#attention-mask>`__
- return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ [What are attention masks?](../glossary#attention-mask)
+ return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
- of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is
+ of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is
raised instead of returning overflowing tokens.
- return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
Whether or not to return special tokens mask information.
- return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to return :obj:`(char_start, char_end)` for each token.
+ return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+ Whether or not to return `(char_start, char_end)` for each token.
This is only available on fast tokenizers inheriting from
- :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
- :obj:`NotImplementedError`.
- return_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ [`PreTrainedTokenizerFast`], if using Python's tokenizer, this method will raise
+ `NotImplementedError`.
+ return_length (`bool`, *optional*, defaults to `False`):
Whether or not to return the lengths of the encoded inputs.
- verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
- **kwargs: passed to the :obj:`self.tokenize()` method
+ **kwargs: passed to the `self.tokenize()` method
- Return:
- :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+ Return:
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
- or if `"token_type_ids"` is in :obj:`self.model_input_names`).
+ - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True`
+ or if `"token_type_ids"` is in `self.model_input_names`).
- `What are token type IDs? <../glossary.html#token-type-ids>`__
+ [What are token type IDs?](../glossary#token-type-ids)
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
- :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+ `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names`).
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- **entity_ids** -- List of entity ids to be fed to a model.
- `What are input IDs? <../glossary.html#input-ids>`__
+ [What are input IDs?](../glossary#input-ids)
- **entity_position_ids** -- List of entity positions in the input sequence to be fed to a model.
- **entity_token_type_ids** -- List of entity token type ids to be fed to a model (when
- :obj:`return_token_type_ids=True` or if `"entity_token_type_ids"` is in :obj:`self.model_input_names`).
+ `return_token_type_ids=True` or if `"entity_token_type_ids"` is in `self.model_input_names`).
- `What are token type IDs? <../glossary.html#token-type-ids>`__
+ [What are token type IDs?](../glossary#token-type-ids)
- **entity_attention_mask** -- List of indices specifying which entities should be attended to by the model
- (when :obj:`return_attention_mask=True` or if `"entity_attention_mask"` is in
- :obj:`self.model_input_names`).
+ (when `return_attention_mask=True` or if `"entity_attention_mask"` is in
+ `self.model_input_names`).
- `What are attention masks? <../glossary.html#attention-mask>`__
+ [What are attention masks?](../glossary#attention-mask)
- **entity_start_positions** -- List of the start positions of entities in the word token sequence (when
- :obj:`task="entity_span_classification"`).
+ `task="entity_span_classification"`).
- **entity_end_positions** -- List of the end positions of entities in the word token sequence (when
- :obj:`task="entity_span_classification"`).
- - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
- :obj:`return_overflowing_tokens=True`).
- - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
- :obj:`return_overflowing_tokens=True`).
+ `task="entity_span_classification"`).
+ - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+ `return_overflowing_tokens=True`).
+ - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+ `return_overflowing_tokens=True`).
- **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
- regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
- - **length** -- The length of the inputs (when :obj:`return_length=True`)
+ regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+ - **length** -- The length of the inputs (when `return_length=True`)
"""
class MLukeTokenizer(PreTrainedTokenizer):
"""
- Adapted from :class:`~transformers.XLMRobertaTokenizer` and :class:`~transformers.LukeTokenizer`. Based on
- `SentencePiece `__.
+ Adapted from [`XLMRobertaTokenizer`] and [`LukeTokenizer`]. Based on
+ [SentencePiece](https://github.com/google/sentencepiece).
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- entity_vocab_file (:obj:`str`):
+ entity_vocab_file (`str`):
Path to the entity vocabulary file.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ bos_token (`str`, *optional*, defaults to `""`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `""`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `""`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `""`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ unk_token (`str`, *optional*, defaults to `""`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `""`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `""`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- task (:obj:`str`, `optional`):
- Task for which you want to prepare sequences. One of :obj:`"entity_classification"`,
- :obj:`"entity_pair_classification"`, or :obj:`"entity_span_classification"`. If you specify this argument,
+ task (`str`, *optional*):
+ Task for which you want to prepare sequences. One of `"entity_classification"`,
+ `"entity_pair_classification"`, or `"entity_span_classification"`. If you specify this argument,
the entity sequence is automatically created based on the given entity span(s).
- max_entity_length (:obj:`int`, `optional`, defaults to 32):
- The maximum length of :obj:`entity_ids`.
- max_mention_length (:obj:`int`, `optional`, defaults to 30):
+ max_entity_length (`int`, *optional*, defaults to 32):
+ The maximum length of `entity_ids`.
+ max_mention_length (`int`, *optional*, defaults to 30):
The maximum number of tokens inside an entity span.
- entity_token_1 (:obj:`str`, `optional`, defaults to :obj:``):
+ entity_token_1 (`str`, *optional*, defaults to `<ent>`):
The special token used to represent an entity span in a word token sequence. This token is only used when
- ``task`` is set to :obj:`"entity_classification"` or :obj:`"entity_pair_classification"`.
- entity_token_2 (:obj:`str`, `optional`, defaults to :obj:``):
+ `task` is set to `"entity_classification"` or `"entity_pair_classification"`.
+ entity_token_2 (`str`, *optional*, defaults to `<ent2>`):
The special token used to represent an entity span in a word token sequence. This token is only used when
- ``task`` is set to :obj:`"entity_pair_classification"`.
- additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
+ `task` is set to `"entity_pair_classification"`.
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
- sp_model_kwargs (:obj:`dict`, `optional`):
- Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
- `__ can be used, among other things, to set:
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- - ``enable_sampling``: Enable subword regularization.
- - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- - ``nbest_size = {0,1}``: No sampling is performed.
- - ``nbest_size > 1``: samples from the nbest_size results.
- - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -373,39 +378,39 @@ class MLukeTokenizer(PreTrainedTokenizer):
sequences, depending on the task you want to prepare them for.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
tokenizer does not support tokenization based on pretokenized strings.
- text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+ text_pair (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
tokenizer does not support tokenization based on pretokenized strings.
- entity_spans (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+ entity_spans (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
with two integers denoting character-based start and end positions of entities. If you specify
- :obj:`"entity_classification"` or :obj:`"entity_pair_classification"` as the ``task`` argument in the
- constructor, the length of each sequence must be 1 or 2, respectively. If you specify ``entities``, the
- length of each sequence must be equal to the length of each sequence of ``entities``.
- entity_spans_pair (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+ `"entity_classification"` or `"entity_pair_classification"` as the `task` argument in the
+ constructor, the length of each sequence must be 1 or 2, respectively. If you specify `entities`, the
+ length of each sequence must be equal to the length of each sequence of `entities`.
+ entity_spans_pair (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
with two integers denoting character-based start and end positions of entities. If you specify the
- ``task`` argument in the constructor, this argument is ignored. If you specify ``entities_pair``, the
- length of each sequence must be equal to the length of each sequence of ``entities_pair``.
- entities (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+ `task` argument in the constructor, this argument is ignored. If you specify `entities_pair`, the
+ length of each sequence must be equal to the length of each sequence of `entities_pair`.
+ entities (`List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
- Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
- of each sequence must be equal to the length of each sequence of ``entity_spans``. If you specify
- ``entity_spans`` without specifying this argument, the entity sequence or the batch of entity sequences
+ Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+ of each sequence must be equal to the length of each sequence of `entity_spans`. If you specify
+ `entity_spans` without specifying this argument, the entity sequence or the batch of entity sequences
is automatically constructed by filling it with the [MASK] entity.
- entities_pair (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+ entities_pair (`List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
- Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
- of each sequence must be equal to the length of each sequence of ``entity_spans_pair``. If you specify
- ``entity_spans_pair`` without specifying this argument, the entity sequence or the batch of entity
+ Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+ of each sequence must be equal to the length of each sequence of `entity_spans_pair`. If you specify
+ `entity_spans_pair` without specifying this argument, the entity sequence or the batch of entity
sequences is automatically constructed by filling it with the [MASK] entity.
- max_entity_length (:obj:`int`, `optional`):
- The maximum length of :obj:`entity_ids`.
+ max_entity_length (`int`, *optional*):
+ The maximum length of `entity_ids`.
"""
# Input type checking for clearer error
is_valid_single_text = isinstance(text, str)
@@ -969,24 +974,24 @@ class MLukeTokenizer(PreTrainedTokenizer):
Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
while taking into account the special tokens and manages a moving window (with user defined stride) for
- overflowing tokens. Please Note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
- or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
+ overflowing tokens. Please note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
+ or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
error.
Args:
- ids (:obj:`List[int]`):
+ ids (`List[int]`):
Tokenized input ids of the first sequence.
- pair_ids (:obj:`List[int]`, `optional`):
+ pair_ids (`List[int]`, *optional*):
Tokenized input ids of the second sequence.
- entity_ids (:obj:`List[int]`, `optional`):
+ entity_ids (`List[int]`, *optional*):
Entity ids of the first sequence.
- pair_entity_ids (:obj:`List[int]`, `optional`):
+ pair_entity_ids (`List[int]`, *optional*):
Entity ids of the second sequence.
- entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+ entity_token_spans (`List[Tuple[int, int]]`, *optional*):
Entity spans of the first sequence.
- pair_entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+ pair_entity_token_spans (`List[Tuple[int, int]]`, *optional*):
Entity spans of the second sequence.
- max_entity_length (:obj:`int`, `optional`):
+ max_entity_length (`int`, *optional*):
The maximum length of the entity sequence.
"""
@@ -1188,46 +1193,45 @@ class MLukeTokenizer(PreTrainedTokenizer):
"""
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with
- ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) .. note:: If the
- ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
- will use the same type unless you provide a different tensor type with ``return_tensors``. In the case of
+ `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`). Note: If the
+ `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
+ will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
PyTorch tensors, you will lose the specific device of your tensors however.
Args:
- encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
- Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
- List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
- List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
- well as in a PyTorch Dataloader collate function. Instead of :obj:`List[int]` you can have tensors
+ encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`):
+ Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], `Dict[str,
+ List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
+ well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors
(numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type.
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
- max_length (:obj:`int`, `optional`):
+ max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
- max_entity_length (:obj:`int`, `optional`):
+ max_entity_length (`int`, *optional*):
The maximum length of the entity sequence.
- pad_to_multiple_of (:obj:`int`, `optional`):
+ pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
- return_attention_mask (:obj:`bool`, `optional`):
+ return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are
- attention masks? <../glossary.html#attention-mask>`__
- return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+ to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are
+ attention masks?](../glossary#attention-mask)
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
- * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
- * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
- verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
"""
# If we have a list of dicts, let's convert it in a dict of lists
@@ -1495,17 +1499,17 @@ class MLukeTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:
- - single sequence: `` X ``
- - pair of sequences: `` A B ``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -1520,18 +1524,18 @@ class MLukeTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer `prepare_for_model` method.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -1552,13 +1556,13 @@ class MLukeTokenizer(PreTrainedTokenizer):
not make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
diff --git a/src/transformers/models/mmbt/configuration_mmbt.py b/src/transformers/models/mmbt/configuration_mmbt.py
index bbb6c9d240..1137917c34 100644
--- a/src/transformers/models/mmbt/configuration_mmbt.py
+++ b/src/transformers/models/mmbt/configuration_mmbt.py
@@ -23,15 +23,15 @@ logger = logging.get_logger(__name__)
class MMBTConfig(object):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to
+ This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to
instantiate a MMBT model according to the specified arguments, defining the model architecture.
Args:
- config (:class:`~transformers.PreTrainedConfig`):
+ config ([`PretrainedConfig`]):
Config of the underlying Transformer models. Its values are copied over to use a single config.
- num_labels (:obj:`int`, `optional`):
+ num_labels (`int`, *optional*):
Size of final Linear layer for classification.
- modal_hidden_size (:obj:`int`, `optional`, defaults to 2048):
+ modal_hidden_size (`int`, *optional*, defaults to 2048):
Embedding dimension of the non-text modality encoder.
"""
diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py
index d9b76c6f6b..d30d31fba6 100644
--- a/src/transformers/models/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/mmbt/modeling_mmbt.py
@@ -208,13 +208,14 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
r"""
Returns:
- Examples::
+ Examples:
- # For example purposes. Not runnable.
- transformer = BertModel.from_pretrained('bert-base-uncased')
- encoder = ImageEncoder(args)
- mmbt = MMBTModel(config, transformer, encoder)
- """
+ ```python
+ # For example purposes. Not runnable.
+ transformer = BertModel.from_pretrained('bert-base-uncased')
+ encoder = ImageEncoder(args)
+ mmbt = MMBTModel(config, transformer, encoder)
+ ```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/mobilebert/configuration_mobilebert.py b/src/transformers/models/mobilebert/configuration_mobilebert.py
index 4f8e338d33..a738fc54c7 100644
--- a/src/transformers/models/mobilebert/configuration_mobilebert.py
+++ b/src/transformers/models/mobilebert/configuration_mobilebert.py
@@ -27,78 +27,80 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MobileBertConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel` or a
- :class:`~transformers.TFMobileBertModel`. It is used to instantiate a MobileBERT model according to the specified
+ This is the configuration class to store the configuration of a [`MobileBertModel`] or a
+ [`TFMobileBertModel`]. It is used to instantiate a MobileBERT model according to the specified
arguments, defining the model architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30522):
+ vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that can be represented by
- the :obj:`inputs_ids` passed when calling :class:`~transformers.MobileBertModel` or
- :class:`~transformers.TFMobileBertModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 512):
+ the `input_ids` passed when calling [`MobileBertModel`] or
+ [`TFMobileBertModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+ num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 4):
+ num_attention_heads (`int`, *optional*, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 512):
+ intermediate_size (`int`, *optional*, defaults to 512):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
+ hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (:obj:`int`, `optional`, defaults to 2):
- The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.MobileBertModel`
- or :class:`~transformers.TFMobileBertModel`.
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`MobileBertModel`]
+ or [`TFMobileBertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- pad_token_id (:obj:`int`, `optional`, defaults to 0):
+ pad_token_id (`int`, *optional*, defaults to 0):
The ID of the token in the word embedding to use as padding.
- embedding_size (:obj:`int`, `optional`, defaults to 128):
+ embedding_size (`int`, *optional*, defaults to 128):
The dimension of the word embedding vectors.
- trigram_input (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ trigram_input (`bool`, *optional*, defaults to `True`):
Use a convolution of trigram as input.
- use_bottleneck (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_bottleneck (`bool`, *optional*, defaults to `True`):
Whether to use bottleneck in BERT.
- intra_bottleneck_size (:obj:`int`, `optional`, defaults to 128):
+ intra_bottleneck_size (`int`, *optional*, defaults to 128):
Size of bottleneck layer output.
- use_bottleneck_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ use_bottleneck_attention (`bool`, *optional*, defaults to `False`):
Whether to use attention inputs from the bottleneck transformation.
- key_query_shared_bottleneck (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ key_query_shared_bottleneck (`bool`, *optional*, defaults to `True`):
Whether to use the same linear transformation for query&key in the bottleneck.
- num_feedforward_networks (:obj:`int`, `optional`, defaults to 4):
+ num_feedforward_networks (`int`, *optional*, defaults to 4):
Number of FFNs in a block.
- normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`):
+ normalization_type (`str`, *optional*, defaults to `"no_norm"`):
The normalization type in MobileBERT.
- classifier_dropout (:obj:`float`, `optional`):
+ classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
- Examples::
+ Examples:
- >>> from transformers import MobileBertModel, MobileBertConfig
+ ```python
+ >>> from transformers import MobileBertModel, MobileBertConfig
- >>> # Initializing a MobileBERT configuration
- >>> configuration = MobileBertConfig()
+ >>> # Initializing a MobileBERT configuration
+ >>> configuration = MobileBertConfig()
- >>> # Initializing a model from the configuration above
- >>> model = MobileBertModel(configuration)
+ >>> # Initializing a model from the configuration above
+ >>> model = MobileBertModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
checkpoints.
diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
index 6de104a941..db7ad2ae09 100644
--- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
@@ -1031,18 +1031,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
r"""
Return:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
- >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
- >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
- >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- >>> outputs = model(input_ids)
- >>> prediction_scores, seq_relationship_scores = outputs[:2]
-
- """
+ >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+ >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
+ >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
+ >>> outputs = model(input_ids)
+ >>> prediction_scores, seq_relationship_scores = outputs[:2]
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
@@ -1242,20 +1242,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel, TFNextS
r"""
Return:
- Examples::
+ Examples:
- >>> import tensorflow as tf
- >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
- >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
- >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+ >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+ >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
- >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
- """
+ >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+ ```"""
inputs = input_processing(
func=self.call,
config=self.config,
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py
index b19fdcbf75..ef9828c407 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py
@@ -37,10 +37,10 @@ class MobileBertTokenizer(BertTokenizer):
r"""
Construct a MobileBERT tokenizer.
- :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+ [`MobileBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
index 702d4d98b3..28eced0356 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
@@ -39,12 +39,12 @@ PRETRAINED_INIT_CONFIGURATION = {}
class MobileBertTokenizerFast(BertTokenizerFast):
r"""
- Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+ Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's *tokenizers* library).
- :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+ [`MobileBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
- Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+ Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
diff --git a/src/transformers/models/mpnet/configuration_mpnet.py b/src/transformers/models/mpnet/configuration_mpnet.py
index 0026b1d6eb..a6fc99486b 100644
--- a/src/transformers/models/mpnet/configuration_mpnet.py
+++ b/src/transformers/models/mpnet/configuration_mpnet.py
@@ -28,57 +28,58 @@ MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MPNetConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.MPNetModel` or a
- :class:`~transformers.TFMPNetModel`. It is used to instantiate a MPNet model according to the specified arguments,
+ This is the configuration class to store the configuration of a [`MPNetModel`] or a
+ [`TFMPNetModel`]. It is used to instantiate a MPNet model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
- to that of the MPNet `mpnet-base `__ architecture.
+ to that of the MPNet [mpnet-base](https://huggingface.co/mpnet-base) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 30527):
+ vocab_size (`int`, *optional*, defaults to 30527):
Vocabulary size of the MPNet model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.MPNetModel` or
- :class:`~transformers.TFMPNetModel`.
- hidden_size (:obj:`int`, `optional`, defaults to 768):
+ `input_ids` passed when calling [`MPNetModel`] or
+ [`TFMPNetModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+ num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+ num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+ intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
- hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+ max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
+ relative_attention_num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer.
- Examples::
+ Examples:
- >>> from transformers import MPNetModel, MPNetConfig
+ ```python
+ >>> from transformers import MPNetModel, MPNetConfig
- >>> # Initializing a MPNet mpnet-base style configuration
- >>> configuration = MPNetConfig()
+ >>> # Initializing a MPNet mpnet-base style configuration
+ >>> configuration = MPNetConfig()
- >>> # Initializing a model from the mpnet-base style configuration
- >>> model = MPNetModel(configuration)
+ >>> # Initializing a model from the mpnet-base style configuration
+ >>> model = MPNetModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "mpnet"
def __init__(
diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py
index c59cd56ec0..1de9746a21 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet.py
@@ -66,56 +66,61 @@ def whitespace_tokenize(text):
class MPNetTokenizer(PreTrainedTokenizer):
"""
- This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the methods. Users should
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should
refer to the superclass for more information regarding methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
- never_split (:obj:`Iterable`, `optional`):
+ never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
- :obj:`do_basic_tokenize=True`
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ `do_basic_tokenize=True`
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
- This should likely be deactivated for Japanese (see this `issue
- `__).
- strip_accents: (:obj:`bool`, `optional`):
+ This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
+ value for `lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -229,17 +234,17 @@ class MPNetTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A MPNet sequence has the following format:
- - single sequence: `` X ``
- - pair of sequences: `` A B ``
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -252,18 +257,18 @@ class MPNetTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` methods.
+ special tokens using the tokenizer `prepare_for_model` methods.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
- :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
@@ -282,13 +287,13 @@ class MPNetTokenizer(PreTrainedTokenizer):
make use of token type ids, therefore a list of zeros is returned.
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -324,19 +329,18 @@ class BasicTokenizer(object):
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- never_split (:obj:`Iterable`, `optional`):
+ never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
- :obj:`do_basic_tokenize=True`
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ `do_basic_tokenize=True`
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
- This should likely be deactivated for Japanese (see this `issue
- `__).
- strip_accents: (:obj:`bool`, `optional`):
+ This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
+ value for `lowercase` (as in the original BERT).
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -353,9 +357,9 @@ class BasicTokenizer(object):
WordPieceTokenizer.
Args:
- **never_split**: (`optional`) list of str
+ never_split (`List[str]`, *optional*):
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
- :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+ [`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -482,14 +486,14 @@ class WordpieceTokenizer(object):
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
- For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+ For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
- text: A single token or whitespace separated tokens. This should have
- already been passed through `BasicTokenizer`.
+ text: A single token or whitespace separated tokens. This should have
+ already been passed through *BasicTokenizer*.
Returns:
- A list of wordpiece tokens.
+ A list of wordpiece tokens.
"""
output_tokens = []
diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
index 8b5aedb278..87b50e144a 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
@@ -50,51 +50,57 @@ PRETRAINED_INIT_CONFIGURATION = {
class MPNetTokenizerFast(PreTrainedTokenizerFast):
r"""
- Construct a "fast" MPNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+ Construct a "fast" MPNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
File containing the vocabulary.
- do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
- bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the beginning of
- sequence. The token used is the :obj:`cls_token`.
- eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
- .. note::
+ <Tip>
- When building a sequence using special tokens, this is not the token that is used for the end of
- sequence. The token used is the :obj:`sep_token`.
- sep_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ When building a sequence using special tokens, this is not the token that is used for the end of
+ sequence. The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
- cls_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
- mask_token (:obj:`str`, `optional`, defaults to :obj:`""`):
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
- issue `__).
- strip_accents: (:obj:`bool`, `optional`):
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+ issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
- value for :obj:`lowercase` (as in the original BERT).
+ value for `lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -151,11 +157,11 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
@property
def mask_token(self) -> str:
"""
- :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
- comprise the space before the ``.
+ comprise the space before the *<mask>*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
@@ -189,13 +195,13 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
make use of token type ids, therefore a list of zeros is returned
Args:
- token_ids_0 (:obj:`List[int]`):
+ token_ids_0 (`List[int]`):
List of ids.
- token_ids_1 (:obj:`List[int]`, `optional`):
+ token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs
Returns:
- :obj:`List[int]`: List of zeros.
+ `List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py
index a5b01da8cb..e2275c5443 100644
--- a/src/transformers/models/mt5/configuration_mt5.py
+++ b/src/transformers/models/mt5/configuration_mt5.py
@@ -23,44 +23,43 @@ logger = logging.get_logger(__name__)
class MT5Config(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.MT5Model` or a
- :class:`~transformers.TFMT5Model`. It is used to instantiate a mT5 model according to the specified arguments,
+ This is the configuration class to store the configuration of a [`MT5Model`] or a
+ [`TFMT5Model`]. It is used to instantiate a mT5 model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
- to that of the mT5 `google/mt5-small <https://huggingface.co/google/mt5-small>`__ architecture.
+ to that of the mT5 [google/mt5-small](https://huggingface.co/google/mt5-small) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Arguments:
- vocab_size (:obj:`int`, `optional`, defaults to 250112):
+ vocab_size (`int`, *optional*, defaults to 250112):
Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.T5Model` or :class:`~transformers.TFT5Model`.
- d_model (:obj:`int`, `optional`, defaults to 512):
+ `input_ids` passed when calling [`T5Model`] or [`TFT5Model`].
+ d_model (`int`, *optional*, defaults to 512):
Size of the encoder layers and the pooler layer.
- d_kv (:obj:`int`, `optional`, defaults to 64):
- Size of the key, query, value projections per attention head. :obj:`d_kv` has to be equal to :obj:`d_model
- // num_heads`.
- d_ff (:obj:`int`, `optional`, defaults to 1024):
- Size of the intermediate feed forward layer in each :obj:`T5Block`.
- num_layers (:obj:`int`, `optional`, defaults to 8):
+ d_kv (`int`, *optional*, defaults to 64):
+ Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
+ d_ff (`int`, *optional*, defaults to 1024):
+ Size of the intermediate feed forward layer in each `T5Block`.
+ num_layers (`int`, *optional*, defaults to 8):
Number of hidden layers in the Transformer encoder.
- num_decoder_layers (:obj:`int`, `optional`):
- Number of hidden layers in the Transformer decoder. Will use the same value as :obj:`num_layers` if not
+ num_decoder_layers (`int`, *optional*):
+ Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not
set.
- num_heads (:obj:`int`, `optional`, defaults to 6):
+ num_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
- relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
+ relative_attention_num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer.
- dropout_rate (:obj:`float`, `optional`, defaults to 0.1):
+ dropout_rate (`float`, *optional*, defaults to 0.1):
The ratio for all dropout layers.
- layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-6):
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
- initializer_factor (:obj:`float`, `optional`, defaults to 1):
+ initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
- feed_forward_proj (:obj:`string`, `optional`, defaults to :obj:`"gated-gelu"`):
- Type of feed forward layer to be used. Should be one of :obj:`"relu"` or :obj:`"gated-gelu"`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
+ Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
"""
model_type = "mt5"
diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py
index 5ba2a80078..7f212b6c3f 100644
--- a/src/transformers/models/openai/configuration_openai.py
+++ b/src/transformers/models/openai/configuration_openai.py
@@ -26,91 +26,92 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://huggingface.c
class OpenAIGPTConfig(PretrainedConfig):
"""
- This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel` or a
- :class:`~transformers.TFOpenAIGPTModel`. It is used to instantiate a GPT model according to the specified
+ This is the configuration class to store the configuration of a [`OpenAIGPTModel`] or a
+ [`TFOpenAIGPTModel`]. It is used to instantiate a GPT model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
- configuration to that of the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
+ configuration to that of the [GPT](https://huggingface.co/openai-gpt) architecture from OpenAI.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 40478):
+ vocab_size (`int`, *optional*, defaults to 40478):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.OpenAIGPTModel` or
- :class:`~transformers.TFOpenAIGPTModel`.
- n_positions (:obj:`int`, `optional`, defaults to 512):
+ `input_ids` passed when calling [`OpenAIGPTModel`] or
+ [`TFOpenAIGPTModel`].
+ n_positions (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- n_embd (:obj:`int`, `optional`, defaults to 768):
+ n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
- n_layer (:obj:`int`, `optional`, defaults to 12):
+ n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
- n_head (:obj:`int`, `optional`, defaults to 12):
+ n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+ afn (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
- attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers
- initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- predict_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ predict_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not special tokens should be predicted when the model has a language modeling head.
- summary_type (:obj:`str`, `optional`, defaults to :obj:`"cls_index"`):
+ summary_type (`str`, *optional*, defaults to `"cls_index"`):
Argument used when doing sequence summary, used in the models
- :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+ [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
Has to be one of the following options:
- - :obj:`"last"`: Take the last token hidden state (like XLNet).
- - :obj:`"first"`: Take the first token hidden state (like BERT).
- - :obj:`"mean"`: Take the mean of all tokens hidden states.
- - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- - :obj:`"attn"`: Not implemented now, use multi-head attention.
- summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ - `"last"`: Take the last token hidden state (like XLNet).
+ - `"first"`: Take the first token hidden state (like BERT).
+ - `"mean"`: Take the mean of all tokens hidden states.
+ - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+ - `"attn"`: Not implemented now, use multi-head attention.
+ summary_use_proj (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary, used in the models
- :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+ [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
Whether or not to add a projection after the vector extraction.
- summary_activation (:obj:`str`, `optional`):
+ summary_activation (`str`, *optional*):
Argument used when doing sequence summary, used in the models
- :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+ [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
- Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
- summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+ summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary, used in the models
- :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+ [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
- Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
- summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
+ Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+ summary_first_dropout (`float`, *optional*, defaults to 0.1):
Argument used when doing sequence summary, used in the models
- :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+ [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
The dropout ratio to be used after the projection and activation.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- Examples::
+ Examples:
- >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
+ ```python
+ >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
- >>> # Initializing a GPT configuration
- >>> configuration = OpenAIGPTConfig()
+ >>> # Initializing a GPT configuration
+ >>> configuration = OpenAIGPTConfig()
- >>> # Initializing a model from the configuration
- >>> model = OpenAIGPTModel(configuration)
+ >>> # Initializing a model from the configuration
+ >>> model = OpenAIGPTModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- """
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
model_type = "openai-gpt"
attribute_map = {
diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py
index e5bc6b245f..0b6987000c 100644
--- a/src/transformers/models/openai/tokenization_openai.py
+++ b/src/transformers/models/openai/tokenization_openai.py
@@ -75,18 +75,18 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
- lowercases all inputs,
- - uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
- :obj:`BasicTokenizer` if not.
+ - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
+ `BasicTokenizer` if not.
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
diff --git a/src/transformers/models/openai/tokenization_openai_fast.py b/src/transformers/models/openai/tokenization_openai_fast.py
index 0b15b6efaa..88bd569ba8 100644
--- a/src/transformers/models/openai/tokenization_openai_fast.py
+++ b/src/transformers/models/openai/tokenization_openai_fast.py
@@ -39,21 +39,21 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
"""
- Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
+ Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
the following peculiarities:
- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization
- This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
- vocab_file (:obj:`str`):
+ vocab_file (`str`):
Path to the vocabulary file.
- merges_file (:obj:`str`):
+ merges_file (`str`):
Path to the merges file.
- unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py
index 8cf76c482b..884a4524b5 100644
--- a/src/transformers/models/pegasus/configuration_pegasus.py
+++ b/src/transformers/models/pegasus/configuration_pegasus.py
@@ -28,77 +28,77 @@ PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class PegasusConfig(PretrainedConfig):
r"""
- This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. It is used to
+ This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to
instantiate an PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a
- configuration with the defaults will yield a similar configuration to that of the PEGASUS `google/pegasus-large
- <https://huggingface.co/google/pegasus-large>`__ architecture.
+ configuration with the defaults will yield a similar configuration to that of the PEGASUS [google/pegasus-large](https://huggingface.co/google/pegasus-large) architecture.
- Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
- outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
+ vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusModel` or
- :class:`~transformers.TFPegasusModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
+ `input_ids` passed when calling [`PegasusModel`] or
+ [`TFPegasusModel`].
+ d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
+ encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
+ decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
+ init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper