Add VideoMAE (#17821)
* First draft * Add VideoMAEForVideoClassification * Improve conversion script * Add VideoMAEForPreTraining * Add VideoMAEFeatureExtractor * Improve VideoMAEFeatureExtractor * Improve docs * Add first draft of model tests * Improve VideoMAEForPreTraining * Fix base_model_prefix * Make model take pixel_values of shape (B, T, C, H, W) * Add loss computation of VideoMAEForPreTraining * Improve tests * Improve model tests * Make all tests pass * Add VideoMAE to main README * Add tests for VideoMAEFeatureExtractor * Add integration test * Improve conversion script * Rename patch embedding class * Remove VideoMAELayer from init * Update design of patch embeddings * Improve comments * Improve conversion script * Improve conversion script * Add conversion of pretrained model * Add loss verification of pretrained model * Add loss verification of unnormalized targets * Add integration test for pretraining model * Apply suggestions from code review * Fix bug to make feature extractor resize only shorter edge * Address more comments * Improve normalization of videos * Add doc examples * Move constants to dedicated script * Remove scripts * Transfer checkpoints, fix docs * Update script * Update image mean and std * Fix doc tests * Set return_tensors to NumPy by default * Revert the previous change Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
This commit is contained in:
@@ -48,49 +48,91 @@ SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
|
||||
def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
    """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
    or a list of PyTorch tensors if one specifies torchify=True.

    One can specify whether the images are of the same resolution or not.

    Args:
        feature_extract_tester: object exposing `batch_size`, `num_channels`, `max_resolution` and (when
            `equal_resolution` is False) `min_resolution`; an optional truthy `size_divisor` raises the lower
            bound of the sampled side lengths.
        equal_resolution: if True, every image has both sides equal to `max_resolution`; otherwise each side is
            drawn independently from `[min_resolution, max_resolution)`.
        numpify: if True, return the raw channels-first `uint8` NumPy arrays.
        torchify: if True, return PyTorch tensors created from those arrays.

    Returns:
        A list of `batch_size` images.

    Raises:
        AssertionError: if both `numpify` and `torchify` are set.
    """

    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"

    image_inputs = []
    for _ in range(feature_extract_tester.batch_size):
        if equal_resolution:
            width = height = feature_extract_tester.max_resolution
        else:
            # To avoid getting image width/height 0
            min_resolution = feature_extract_tester.min_resolution
            if getattr(feature_extract_tester, "size_divisor", None):
                # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
                min_resolution = max(feature_extract_tester.size_divisor, min_resolution)
            # `np.arange` excludes its stop value, so `max_resolution` itself is never sampled here.
            width, height = np.random.choice(np.arange(min_resolution, feature_extract_tester.max_resolution), 2)
        image_inputs.append(
            np.random.randint(
                255,
                size=(
                    feature_extract_tester.num_channels,
                    width,
                    height,
                ),
                dtype=np.uint8,
            )
        )

    if not numpify and not torchify:
        # PIL expects the channel dimension as last dimension
        image_inputs = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in image_inputs]

    if torchify:
        image_inputs = [torch.from_numpy(image) for image in image_inputs]

    return image_inputs
|
||||
|
||||
|
||||
def prepare_video(feature_extract_tester, width=10, height=10, numpify=False, torchify=False):
    """This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors.

    Args:
        feature_extract_tester: object exposing `num_frames` and `num_channels`.
        width: size of the second axis of every generated frame.
        height: size of the third axis of every generated frame.
        numpify: if True, return the raw channels-first `uint8` NumPy arrays.
        torchify: if True, return PyTorch tensors created from those arrays.

    Returns:
        A list of `num_frames` frames. Frames are PIL images unless `numpify` or `torchify` is set.
    """

    frame_shape = (feature_extract_tester.num_channels, width, height)
    # `np.random.randint(255, ...)` samples from [0, 255): the upper bound is exclusive.
    video = [
        np.random.randint(255, size=frame_shape, dtype=np.uint8) for _ in range(feature_extract_tester.num_frames)
    ]

    if not numpify and not torchify:
        # PIL expects the channel dimension as last dimension
        video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]

    if torchify:
        video = [torch.from_numpy(frame) for frame in video]

    return video
|
||||
|
||||
|
||||
def prepare_video_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
    """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
    one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.

    One can specify whether the videos are of the same resolution or not.

    Args:
        feature_extract_tester: object exposing `batch_size` and `max_resolution`, plus `min_resolution` when
            `equal_resolution` is False; it is also forwarded to `prepare_video`.
        equal_resolution: if True, every video uses `max_resolution` for both sides; otherwise each video gets its
            own width and height drawn from `[min_resolution, max_resolution)`.
        numpify: forwarded to `prepare_video`.
        torchify: forwarded to `prepare_video`.

    Returns:
        A list of `batch_size` videos, each being whatever `prepare_video` returns.

    Raises:
        AssertionError: if both `numpify` and `torchify` are set.
    """

    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"

    video_inputs = []
    for _ in range(feature_extract_tester.batch_size):
        if equal_resolution:
            width = height = feature_extract_tester.max_resolution
        else:
            # One (width, height) pair per video; `np.arange` excludes `max_resolution` itself.
            width, height = np.random.choice(
                np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2
            )
        video = prepare_video(
            feature_extract_tester=feature_extract_tester,
            width=width,
            height=height,
            numpify=numpify,
            torchify=torchify,
        )
        video_inputs.append(video)

    return video_inputs
|
||||
|
||||
|
||||
class FeatureExtractionSavingTestMixin:
|
||||
def test_feat_extract_to_json_string(self):
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
|
||||
Reference in New Issue
Block a user