From 9e29080439f02af7cfbb4ab165bf7a9524ef8904 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Wed, 12 Oct 2022 17:05:12 +0200 Subject: [PATCH] [X-CLIP] Fix doc tests (#19523) * Fix XCLIP doc tests * Add model to doc test list * Fix tests --- .../models/auto/processing_auto.py | 2 +- .../models/x_clip/modeling_x_clip.py | 137 ++++++++++++++---- tests/models/x_clip/test_modeling_x_clip.py | 2 +- utils/documentation_tests.txt | 1 + 4 files changed, 111 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 8281d6a3bd..d98855dad4 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"), ("wavlm", "Wav2Vec2Processor"), ("whisper", "WhisperProcessor"), - ("xclip", "CLIPProcessor"), + ("xclip", "XCLIPProcessor"), ] ) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 00ae9d7206..27306af627 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel): Examples: ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import XCLIPProcessor, XCLIPVisionModel + >>> from decord import VideoReader, cpu + >>> import torch + >>> import numpy as np + >>> from transformers import AutoProcessor, XCLIPVisionModel + >>> from huggingface_hub import hf_hub_download + + >>> np.random.seed(0) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... 
indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... ) + >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + + >>> # sample 8 frames + >>> vr.seek(0) + >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr)) + >>> video = vr.get_batch(indices).asnumpy() + + >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32") >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32") - >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values - >>> inputs = processor(images=image, return_tensors="pt") + >>> batch_size, num_frames, num_channels, height, width = pixel_values.shape + >>> pixel_values = pixel_values.reshape(-1, num_channels, height, width) - >>> outputs = model(**inputs) + >>> outputs = model(pixel_values) >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" return self.vision_model( pixel_values=pixel_values, @@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel): Examples: ```python - >>> from transformers import CLIPTokenizer, XCLIPModel + >>> from transformers import AutoTokenizer, AutoModel - >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") - >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32") + >>> model = 
AutoModel.from_pretrained("microsoft/xclip-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) @@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel): Examples: ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import XCLIPProcessor, XCLIPModel + >>> from decord import VideoReader, cpu + >>> import torch + >>> import numpy as np - >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") - >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> from transformers import AutoProcessor, AutoModel + >>> from huggingface_hub import hf_hub_download - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> np.random.seed(0) - >>> inputs = processor(images=image, return_tensors="pt") + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... 
) + >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + + >>> # sample 8 frames + >>> vr.seek(0) + >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr)) + >>> video = vr.get_batch(indices).asnumpy() + + >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32") + + >>> inputs = processor(videos=list(video), return_tensors="pt") >>> video_features = model.get_video_features(**inputs) ```""" @@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel): Examples: ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import XCLIPProcessor, XCLIPModel + >>> from decord import VideoReader, cpu + >>> import torch + >>> import numpy as np - >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32") - >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> from transformers import AutoProcessor, AutoModel + >>> from huggingface_hub import hf_hub_download - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> np.random.seed(0) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... 
) + >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + + >>> # sample 8 frames + >>> vr.seek(0) + >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr)) + >>> video = vr.get_batch(indices).asnumpy() + + >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32") + >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32") >>> inputs = processor( - ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... text=["playing sports", "eating spaghetti", "go shopping"], + ... videos=list(video), + ... return_tensors="pt", + ... padding=True, ... ) - >>> outputs = model(**inputs) + >>> # forward pass + >>> with torch.no_grad(): + ... outputs = model(**inputs) + >>> logits_per_video = outputs.logits_per_video # this is the video-text similarity score >>> probs = logits_per_video.softmax(dim=1) # we can take the softmax to get the label probabilities + >>> print(probs) + tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]]) ```""" # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 0a70fdcb44..0e9826d781 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase): torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), ) - expected_logits = torch.tensor([[14.3819, 20.6031, 15.0526]], device=torch_device) + expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device) self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3)) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 7f28f75072..49107cd6f8 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py src/transformers/models/whisper/modeling_whisper.py src/transformers/models/whisper/modeling_tf_whisper.py src/transformers/models/yolos/modeling_yolos.py +src/transformers/models/x_clip/modeling_x_clip.py