From b5b4e549206e6753f9b813f0b7eb8154931a60de Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Tue, 20 Jul 2021 18:58:50 +0530 Subject: [PATCH] add and fix examples (#12810) --- docs/source/model_doc/clip.rst | 1 - src/transformers/models/clip/modeling_clip.py | 73 +++++++++++++++++++ .../models/clip/modeling_flax_clip.py | 31 +++++++- 3 files changed, 102 insertions(+), 3 deletions(-) diff --git a/docs/source/model_doc/clip.rst b/docs/source/model_doc/clip.rst index 3dbd3b73e1..d260b0f067 100644 --- a/docs/source/model_doc/clip.rst +++ b/docs/source/model_doc/clip.rst @@ -60,7 +60,6 @@ encode the text and prepare the images. The following example shows how to get t .. code-block:: - >>> import torch >>> from PIL import Image >>> import requests diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 9fb65dbafa..eb1200ff54 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -699,6 +699,18 @@ class CLIPTextModel(CLIPPreTrainedModel): r""" Returns: + Examples:: + + >>> from transformers import CLIPTokenizer, CLIPTextModel + + >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states """ return self.text_model( input_ids=input_ids, @@ -791,6 +803,23 @@ class CLIPVisionModel(CLIPPreTrainedModel): r""" Returns: + Examples:: + + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPVisionModel + + >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> 
url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states """ return self.vision_model( pixel_values=pixel_values, @@ -847,6 +876,16 @@ class CLIPModel(CLIPPreTrainedModel): Returns: text_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPTextModel`. + + Examples:: + + >>> from transformers import CLIPTokenizer, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) """ text_outputs = self.text_model( input_ids=input_ids, @@ -874,6 +913,22 @@ class CLIPModel(CLIPPreTrainedModel): Returns: image_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPVisionModel`. 
+ + Examples:: + + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) """ vision_outputs = self.vision_model( pixel_values=pixel_values, @@ -903,6 +958,24 @@ class CLIPModel(CLIPPreTrainedModel): r""" Returns: + Examples:: + + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + """ return_dict = return_dict if return_dict is not None else self.config.return_dict vision_outputs = self.vision_model( diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py index fc6418ee4b..4b3a311d1d 100644 --- a/src/transformers/models/clip/modeling_flax_clip.py +++ b/src/transformers/models/clip/modeling_flax_clip.py @@ -803,6 +803,16 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel): Returns: text_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim`): The text 
embeddings obtained by applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPTextModel`. + + Examples:: + + >>> from transformers import CLIPTokenizer, FlaxCLIPModel + + >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np") + >>> text_features = model.get_text_features(**inputs) """ if position_ids is None: position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) @@ -848,6 +858,22 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel): image_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPVisionModel` + + Examples:: + + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, FlaxCLIPModel + + >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="np") + + >>> image_features = model.get_image_features(**inputs) """ pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) @@ -907,6 +933,7 @@ FLAX_CLIP_TEXT_MODEL_DOCSTRING = """ Returns: Example:: + >>> from transformers import CLIPTokenizer, FlaxCLIPTextModel >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") @@ -957,9 +984,9 @@ FLAX_CLIP_VISION_MODEL_DOCSTRING = """ Returns: Example:: + >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, FlaxCLIPVisionModel >>> model = 
FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") @@ -1078,10 +1105,10 @@ FLAX_CLIP_MODEL_DOCSTRING = """ Returns: Example:: + >>> import jax >>> from PIL import Image >>> import requests - >>> from transformers import CLIPProcessor, FlaxCLIPModel >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")