add and fix examples (#12810)
This commit is contained in:
@@ -60,7 +60,6 @@ encode the text and prepare the images. The following example shows how to get t
|
|||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
>>> import torch
|
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image
|
||||||
>>> import requests
|
>>> import requests
|
||||||
|
|
||||||
|
|||||||
@@ -699,6 +699,18 @@ class CLIPTextModel(CLIPPreTrainedModel):
|
|||||||
r"""
|
r"""
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from transformers import CLIPTokenizer, CLIPTextModel
|
||||||
|
|
||||||
|
>>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
|
||||||
|
|
||||||
|
>>> outputs = model(**inputs)
|
||||||
|
>>> last_hidden_state = outputs.last_hidden_state
|
||||||
|
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
|
||||||
"""
|
"""
|
||||||
return self.text_model(
|
return self.text_model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
@@ -791,6 +803,23 @@ class CLIPVisionModel(CLIPPreTrainedModel):
|
|||||||
r"""
|
r"""
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from PIL import Image
|
||||||
|
>>> import requests
|
||||||
|
>>> from transformers import CLIPProcessor, CLIPVisionModel
|
||||||
|
|
||||||
|
>>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
|
|
||||||
|
>>> inputs = processor(images=image, return_tensors="pt")
|
||||||
|
|
||||||
|
>>> outputs = model(**inputs)
|
||||||
|
>>> last_hidden_state = outputs.last_hidden_state
|
||||||
|
>>> pooled_output = outputs.pooler_output # pooled CLS states
|
||||||
"""
|
"""
|
||||||
return self.vision_model(
|
return self.vision_model(
|
||||||
pixel_values=pixel_values,
|
pixel_values=pixel_values,
|
||||||
@@ -847,6 +876,16 @@ class CLIPModel(CLIPPreTrainedModel):
|
|||||||
Returns:
|
Returns:
|
||||||
text_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim)`): The text embeddings
|
text_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim)`): The text embeddings
|
||||||
obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPTextModel`.
|
obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPTextModel`.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from transformers import CLIPTokenizer, CLIPModel
|
||||||
|
|
||||||
|
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
|
||||||
|
>>> text_features = model.get_text_features(**inputs)
|
||||||
"""
|
"""
|
||||||
text_outputs = self.text_model(
|
text_outputs = self.text_model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
@@ -874,6 +913,22 @@ class CLIPModel(CLIPPreTrainedModel):
|
|||||||
Returns:
|
Returns:
|
||||||
image_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim)`): The image embeddings
|
image_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim)`): The image embeddings
|
||||||
obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPVisionModel`.
|
obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPVisionModel`.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from PIL import Image
|
||||||
|
>>> import requests
|
||||||
|
>>> from transformers import CLIPProcessor, CLIPModel
|
||||||
|
|
||||||
|
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
|
|
||||||
|
>>> inputs = processor(images=image, return_tensors="pt")
|
||||||
|
|
||||||
|
>>> image_features = model.get_image_features(**inputs)
|
||||||
"""
|
"""
|
||||||
vision_outputs = self.vision_model(
|
vision_outputs = self.vision_model(
|
||||||
pixel_values=pixel_values,
|
pixel_values=pixel_values,
|
||||||
@@ -903,6 +958,24 @@ class CLIPModel(CLIPPreTrainedModel):
|
|||||||
r"""
|
r"""
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from PIL import Image
|
||||||
|
>>> import requests
|
||||||
|
>>> from transformers import CLIPProcessor, CLIPModel
|
||||||
|
|
||||||
|
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
|
|
||||||
|
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
|
||||||
|
|
||||||
|
>>> outputs = model(**inputs)
|
||||||
|
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
|
||||||
|
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
||||||
vision_outputs = self.vision_model(
|
vision_outputs = self.vision_model(
|
||||||
|
|||||||
@@ -803,6 +803,16 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
|
|||||||
Returns:
|
Returns:
|
||||||
text_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim)`): The text embeddings
|
text_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim)`): The text embeddings
|
||||||
obtained by applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPTextModel`.
|
obtained by applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPTextModel`.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from transformers import CLIPTokenizer, FlaxCLIPModel
|
||||||
|
|
||||||
|
>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
|
||||||
|
>>> text_features = model.get_text_features(**inputs)
|
||||||
"""
|
"""
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
|
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
|
||||||
@@ -848,6 +858,22 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
|
|||||||
image_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim)`): The image embeddings
|
image_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim)`): The image embeddings
|
||||||
obtained by applying the projection layer to the pooled output of
|
obtained by applying the projection layer to the pooled output of
|
||||||
:class:`~transformers.FlaxCLIPVisionModel`
|
:class:`~transformers.FlaxCLIPVisionModel`
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
>>> from PIL import Image
|
||||||
|
>>> import requests
|
||||||
|
>>> from transformers import CLIPProcessor, FlaxCLIPModel
|
||||||
|
|
||||||
|
>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|
||||||
|
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
|
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||||
|
|
||||||
|
>>> inputs = processor(images=image, return_tensors="np")
|
||||||
|
|
||||||
|
>>> image_features = model.get_image_features(**inputs)
|
||||||
"""
|
"""
|
||||||
pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
|
pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
|
||||||
|
|
||||||
@@ -907,6 +933,7 @@ FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
|
|||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
>>> from transformers import CLIPTokenizer, FlaxCLIPTextModel
|
>>> from transformers import CLIPTokenizer, FlaxCLIPTextModel
|
||||||
|
|
||||||
>>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
|
>>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
@@ -957,9 +984,9 @@ FLAX_CLIP_VISION_MODEL_DOCSTRING = """
|
|||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image
|
||||||
>>> import requests
|
>>> import requests
|
||||||
|
|
||||||
>>> from transformers import CLIPProcessor, FlaxCLIPVisionModel
|
>>> from transformers import CLIPProcessor, FlaxCLIPVisionModel
|
||||||
|
|
||||||
>>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
|
>>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
@@ -1078,10 +1105,10 @@ FLAX_CLIP_MODEL_DOCSTRING = """
|
|||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
>>> import jax
|
>>> import jax
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image
|
||||||
>>> import requests
|
>>> import requests
|
||||||
|
|
||||||
>>> from transformers import CLIPProcessor, FlaxCLIPModel
|
>>> from transformers import CLIPProcessor, FlaxCLIPModel
|
||||||
|
|
||||||
>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||||
|
|||||||
Reference in New Issue
Block a user