From e7206ceab9db2dec4742cec6ac57b9a630bd6ea6 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 19 Sep 2022 19:22:34 +0200 Subject: [PATCH] Improve vision models docs (#19103) * Add tips * Add BEiT figure * Fix URL * Move tip to start * Add tip to TF model as well Co-authored-by: Niels Rogge --- docs/source/en/model_doc/beit.mdx | 5 +++++ docs/source/en/model_doc/vit.mdx | 12 +++++------- docs/source/en/model_doc/xclip.mdx | 3 ++- src/transformers/models/deit/modeling_deit.py | 11 +++++++++-- src/transformers/models/swin/modeling_swin.py | 11 +++++++++-- .../models/swinv2/modeling_swinv2.py | 12 ++++++++++-- .../models/vit/modeling_tf_vit.py | 8 ++++++++ src/transformers/models/vit/modeling_vit.py | 19 +++++++++++++++++-- .../models/vit_mae/modeling_vit_mae.py | 11 ++++++++++- 9 files changed, 75 insertions(+), 17 deletions(-) diff --git a/docs/source/en/model_doc/beit.mdx b/docs/source/en/model_doc/beit.mdx index 625357810d..f8177443d1 100644 --- a/docs/source/en/model_doc/beit.mdx +++ b/docs/source/en/model_doc/beit.mdx @@ -59,6 +59,11 @@ Tips: `use_relative_position_bias` attribute of [`BeitConfig`] to `True` in order to add position embeddings. + + + BEiT pre-training. Taken from the original paper. + This model was contributed by [nielsr](https://huggingface.co/nielsr). The JAX/FLAX version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/beit). diff --git a/docs/source/en/model_doc/vit.mdx b/docs/source/en/model_doc/vit.mdx index 37c469f6aa..5978d4518e 100644 --- a/docs/source/en/model_doc/vit.mdx +++ b/docs/source/en/model_doc/vit.mdx @@ -12,13 +12,6 @@ specific language governing permissions and limitations under the License. # Vision Transformer (ViT) - - -This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight -breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). - - - ## Overview The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition @@ -63,6 +56,11 @@ Tips: language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant improvement of 2% to training from scratch, but still 4% behind supervised pre-training. + + + ViT architecture. Taken from the original paper. + Following the original Vision Transformer, some follow-up works have been made: - [DeiT](deit) (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx index 4d572b6760..96832f46e5 100644 --- a/docs/source/en/model_doc/xclip.mdx +++ b/docs/source/en/model_doc/xclip.mdx @@ -23,7 +23,8 @@ The abstract from the paper is the following: Tips: -- Usage of X-CLIP is identical to CLIP. +- Usage of X-CLIP is identical to [CLIP](clip). +- Demo notebooks for X-CLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/X-CLIP). drawing diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 8f8307499f..44110f5e44 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -555,8 +555,15 @@ class DeiTPooler(nn.Module): @add_start_docstrings( - "DeiT Model with a decoder on top for masked image modeling, as proposed in" - " [SimMIM](https://arxiv.org/abs/2111.09886).", + """DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """, DEIT_START_DOCSTRING, ) class DeiTForMaskedImageModeling(DeiTPreTrainedModel): diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 58d01d1cdf..588a4200fb 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -1007,8 +1007,15 @@ class SwinModel(SwinPreTrainedModel): @add_start_docstrings( - "Swin Model with a decoder on top for masked image modeling, as proposed in" - " [SimMIM](https://arxiv.org/abs/2111.09886).", + """Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """, SWIN_START_DOCSTRING, ) class SwinForMaskedImageModeling(SwinPreTrainedModel): diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 890530691d..926a7dd276 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -1087,8 +1087,16 @@ class Swinv2Model(Swinv2PreTrainedModel): @add_start_docstrings( - "Swinv2 Model with a decoder on top for masked image modeling, as proposed in" - " [SimMIM](https://arxiv.org/abs/2111.09886).", + """Swinv2 Model with a decoder on top for masked image modeling, as proposed in +[SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """, SWINV2_START_DOCSTRING, ) # Copied from transformers.models.swin.modeling_swin.SwinForMaskedImageModeling with SWIN->SWINV2,Swin->Swinv2,swin->swinv2,224->256,window7->window8 diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index 754a86ce28..727cbb7517 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -733,6 +733,14 @@ class TFViTPooler(tf.keras.layers.Layer): """ ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + """, VIT_START_DOCSTRING, ) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 7017f232f0..e6df4baa70 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -597,8 +597,15 @@ class ViTPooler(nn.Module): @add_start_docstrings( - "ViT Model with a decoder on top for masked image modeling, as proposed in" - " [SimMIM](https://arxiv.org/abs/2111.09886).", + """ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """, VIT_START_DOCSTRING, ) class ViTForMaskedImageModeling(ViTPreTrainedModel): @@ -712,6 +719,14 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel): """ ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + """, VIT_START_DOCSTRING, ) diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 20ed445271..b612d2f67b 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -837,7 +837,16 @@ class ViTMAEDecoder(nn.Module): @add_start_docstrings( - "The ViTMAE Model transformer with the decoder on top for self-supervised pre-training.", + """The ViTMAE Model transformer with the decoder on top for self-supervised pre-training. + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + + """, VIT_MAE_START_DOCSTRING, ) class ViTMAEForPreTraining(ViTMAEPreTrainedModel):