Fix ViT-MAE decoder interpolate (#33330)
* Fix ViT-MAE decoder interpolate
* Add unit test for `interpolate_pos_encoding` w/ custom sizes
* [run_slow] vit_mae
This commit is contained in:
@@ -298,12 +298,16 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("facebook/vit-mae-base")
|
||||
|
||||
@cached_property
def default_model(self):
    """Return the pretrained ``facebook/vit-mae-base`` model placed on ``torch_device``."""
    pretrained = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
    return pretrained.to(torch_device)
|
||||
|
||||
@slow
|
||||
def test_inference_for_pretraining(self):
|
||||
# make random mask reproducible across the PT and TF model
|
||||
np.random.seed(2)
|
||||
|
||||
model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
|
||||
model = self.default_model
|
||||
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
@@ -313,11 +317,11 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
# (this way we can ensure that the PT and TF models operate on the same inputs)
|
||||
vit_mae_config = ViTMAEConfig()
|
||||
num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2)
|
||||
noise = np.random.uniform(size=(1, num_patches))
|
||||
noise = torch.from_numpy(np.random.uniform(size=(1, num_patches))).to(device=torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device))
|
||||
outputs = model(**inputs, noise=noise)
|
||||
|
||||
# verify the logits
|
||||
expected_shape = torch.Size((1, 196, 768))
|
||||
@@ -339,7 +343,7 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
# make random mask reproducible across the PT and TF model
|
||||
np.random.seed(2)
|
||||
|
||||
model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
|
||||
model = self.default_model
|
||||
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
@@ -349,14 +353,38 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
# (this way we can ensure that the PT and TF models operate on the same inputs)
|
||||
vit_mae_config = ViTMAEConfig()
|
||||
num_patches = (image.height // vit_mae_config.patch_size) * (image.width // vit_mae_config.patch_size)
|
||||
noise = np.random.uniform(size=(1, num_patches))
|
||||
noise = torch.from_numpy(np.random.uniform(size=(1, num_patches))).to(device=torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
outputs = model(
|
||||
**inputs, noise=torch.from_numpy(noise).to(device=torch_device), interpolate_pos_encoding=True
|
||||
)
|
||||
outputs = model(**inputs, noise=noise, interpolate_pos_encoding=True)
|
||||
|
||||
# verify the logits
|
||||
expected_shape = torch.Size((1, 1200, 768))
|
||||
self.assertEqual(outputs.logits.shape, expected_shape)
|
||||
|
||||
@slow
def test_inference_interpolate_pos_encoding_custom_sizes(self):
    """Check that custom input sizes work when interpolating the position embeddings.

    The image processor is asked for a 256x256 input, the model is run with
    ``interpolate_pos_encoding=True``, and the logits must have shape
    ``(1, 256, 768)``.
    """
    # Seed NumPy's global RNG, in line with the other integration tests of this class.
    np.random.seed(2)

    model = self.default_model
    image_processor = self.default_image_processor

    custom_size = {"height": 256, "width": 256}
    encoded = image_processor(images=prepare_img(), return_tensors="pt", size=custom_size)
    inputs = encoded.to(torch_device)

    # forward pass, without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs, interpolate_pos_encoding=True)

    # verify the logits
    self.assertEqual(outputs.logits.shape, torch.Size((1, 256, 768)))
|
||||
|
||||
Reference in New Issue
Block a user