Update old existing feature extractor references (#24552)
* Update old existing feature extractor references
* Typo
* Apply suggestions from code review
* Apply suggestions from code review
* Apply suggestions from code review
* Address comments from review - update 'feature extractor'

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -49,7 +49,7 @@ if is_vision_available():
|
||||
import PIL
|
||||
from PIL import Image
|
||||
|
||||
from transformers import BeitFeatureExtractor
|
||||
from transformers import BeitImageProcessor
|
||||
|
||||
|
||||
class BeitModelTester:
|
||||
@@ -342,18 +342,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class BeitModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return (
|
||||
BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
|
||||
)
|
||||
def default_image_processor(self):
|
||||
return BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_masked_image_modeling_head(self):
|
||||
model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
|
||||
# prepare bool_masked_pos
|
||||
bool_masked_pos = torch.ones((1, 196), dtype=torch.bool).to(torch_device)
|
||||
@@ -377,9 +375,9 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head_imagenet_1k(self):
|
||||
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -403,9 +401,9 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -428,11 +426,11 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
|
||||
model = model.to(torch_device)
|
||||
|
||||
feature_extractor = BeitFeatureExtractor(do_resize=True, size=640, do_center_crop=False)
|
||||
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
image = Image.open(ds[0]["file"])
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -471,11 +469,11 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
|
||||
model = model.to(torch_device)
|
||||
|
||||
feature_extractor = BeitFeatureExtractor(do_resize=True, size=640, do_center_crop=False)
|
||||
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
image = Image.open(ds[0]["file"])
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -483,10 +481,10 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
outputs.logits = outputs.logits.detach().cpu()
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
|
||||
expected_shape = torch.Size((500, 300))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs)
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
|
||||
expected_shape = torch.Size((160, 160))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
@@ -33,7 +33,7 @@ if is_flax_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import BeitFeatureExtractor
|
||||
from transformers import BeitImageProcessor
|
||||
|
||||
|
||||
class FlaxBeitModelTester(unittest.TestCase):
|
||||
@@ -219,18 +219,16 @@ def prepare_img():
|
||||
@require_flax
|
||||
class FlaxBeitModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return (
|
||||
BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
|
||||
)
|
||||
def default_image_processor(self):
|
||||
return BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_masked_image_modeling_head(self):
|
||||
model = FlaxBeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
|
||||
pixel_values = image_processor(images=image, return_tensors="np").pixel_values
|
||||
|
||||
# prepare bool_masked_pos
|
||||
bool_masked_pos = np.ones((1, 196), dtype=bool)
|
||||
@@ -253,9 +251,9 @@ class FlaxBeitModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head_imagenet_1k(self):
|
||||
model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="np")
|
||||
inputs = image_processor(images=image, return_tensors="np")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
@@ -276,9 +274,9 @@ class FlaxBeitModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head_imagenet_22k(self):
|
||||
model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-large-patch16-224-pt22k-ft22k")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="np")
|
||||
inputs = image_processor(images=image, return_tensors="np")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -297,7 +297,7 @@ def prepare_img():
|
||||
@require_vision
|
||||
class BitModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
BitImageProcessor.from_pretrained(BIT_PRETRAINED_MODEL_ARCHIVE_LIST[0]) if is_vision_available() else None
|
||||
)
|
||||
@@ -306,9 +306,9 @@ class BitModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = BitForImageClassification.from_pretrained(BIT_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -145,7 +145,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
|
||||
pass
|
||||
|
||||
def test_call_pil(self):
|
||||
# Initialize feature_extractor
|
||||
# Initialize image processor
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PIL images
|
||||
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False)
|
||||
@@ -176,7 +176,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
|
||||
)
|
||||
|
||||
def test_call_numpy(self):
|
||||
# Initialize feature_extractor
|
||||
# Initialize image processor
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
|
||||
@@ -207,7 +207,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
|
||||
)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
# Initialize feature_extractor
|
||||
# Initialize image processor
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
|
||||
@@ -238,7 +238,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
|
||||
)
|
||||
|
||||
def test_equivalence_pad_and_create_pixel_mask(self):
|
||||
# Initialize feature_extractors
|
||||
# Initialize image processors
|
||||
image_processing_1 = self.image_processing_class(**self.image_processor_dict)
|
||||
image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False)
|
||||
# create random PyTorch tensors
|
||||
|
||||
@@ -43,7 +43,7 @@ if is_timm_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ConditionalDetrFeatureExtractor
|
||||
from transformers import ConditionalDetrImageProcessor
|
||||
|
||||
|
||||
class ConditionalDetrModelTester:
|
||||
@@ -493,9 +493,9 @@ def prepare_img():
|
||||
@slow
|
||||
class ConditionalDetrModelIntegrationTests(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
|
||||
ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -503,9 +503,9 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
|
||||
def test_inference_no_head(self):
|
||||
model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**encoding)
|
||||
@@ -522,9 +522,9 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
pixel_values = encoding["pixel_values"].to(torch_device)
|
||||
pixel_mask = encoding["pixel_mask"].to(torch_device)
|
||||
|
||||
@@ -547,7 +547,7 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
|
||||
|
||||
# verify postprocessing
|
||||
results = feature_extractor.post_process_object_detection(
|
||||
results = image_processor.post_process_object_detection(
|
||||
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
|
||||
)[0]
|
||||
expected_scores = torch.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355]).to(torch_device)
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class ConvNextModelTester:
|
||||
@@ -285,16 +285,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ConvNextModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ConvNextFeatureExtractor
|
||||
from transformers import ConvNextImageProcessor
|
||||
|
||||
|
||||
class TFConvNextModelTester:
|
||||
@@ -279,18 +279,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class TFConvNextModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return (
|
||||
ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
|
||||
)
|
||||
def default_image_processor(self):
|
||||
return ConvNextImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class CvtConfigTester(ConfigTester):
|
||||
@@ -264,16 +264,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class CvtModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained(CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained(CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = CvtForImageClassification.from_pretrained(CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -28,7 +28,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class TFCvtConfigTester(ConfigTester):
|
||||
@@ -265,16 +265,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class TFCvtModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained(TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained(TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFCvtForImageClassification.from_pretrained(TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -44,7 +44,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import BeitFeatureExtractor
|
||||
from transformers import BeitImageProcessor
|
||||
|
||||
|
||||
class Data2VecVisionModelTester:
|
||||
@@ -327,11 +327,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class Data2VecVisionModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
BeitFeatureExtractor.from_pretrained("facebook/data2vec-vision-base-ft1k")
|
||||
if is_vision_available()
|
||||
else None
|
||||
BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") if is_vision_available() else None
|
||||
)
|
||||
|
||||
@slow
|
||||
@@ -340,9 +338,9 @@ class Data2VecVisionModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -46,7 +46,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import BeitFeatureExtractor
|
||||
from transformers import BeitImageProcessor
|
||||
|
||||
|
||||
class TFData2VecVisionModelTester:
|
||||
@@ -469,20 +469,18 @@ def prepare_img():
|
||||
@require_vision
|
||||
class TFData2VecVisionModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
BeitFeatureExtractor.from_pretrained("facebook/data2vec-vision-base-ft1k")
|
||||
if is_vision_available()
|
||||
else None
|
||||
BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") if is_vision_available() else None
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head_imagenet_1k(self):
|
||||
model = TFData2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_timm_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class DeformableDetrModelTester:
|
||||
@@ -563,15 +563,15 @@ def prepare_img():
|
||||
@slow
|
||||
class DeformableDetrModelIntegrationTests(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None
|
||||
|
||||
def test_inference_object_detection_head(self):
|
||||
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
pixel_values = encoding["pixel_values"].to(torch_device)
|
||||
pixel_mask = encoding["pixel_mask"].to(torch_device)
|
||||
|
||||
@@ -595,7 +595,7 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
|
||||
|
||||
# verify postprocessing
|
||||
results = feature_extractor.post_process_object_detection(
|
||||
results = image_processor.post_process_object_detection(
|
||||
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
|
||||
)[0]
|
||||
expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device)
|
||||
@@ -612,9 +612,9 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
|
||||
"SenseTime/deformable-detr-with-box-refine-two-stage"
|
||||
).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
pixel_values = encoding["pixel_values"].to(torch_device)
|
||||
pixel_mask = encoding["pixel_mask"].to(torch_device)
|
||||
|
||||
@@ -639,9 +639,9 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
|
||||
|
||||
@require_torch_gpu
|
||||
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt")
|
||||
encoding = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = encoding["pixel_values"]
|
||||
pixel_mask = encoding["pixel_mask"]
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DeiTFeatureExtractor
|
||||
from transformers import DeiTImageProcessor
|
||||
|
||||
|
||||
class DeiTModelTester:
|
||||
@@ -381,9 +381,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class DeiTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
|
||||
DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -394,9 +394,9 @@ class DeiTModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -420,10 +420,10 @@ class DeiTModelIntegrationTest(unittest.TestCase):
|
||||
model = DeiTModel.from_pretrained(
|
||||
"facebook/deit-base-distilled-patch16-224", torch_dtype=torch.float16, device_map="auto"
|
||||
)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
inputs = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = inputs.pixel_values.to(torch_device)
|
||||
|
||||
# forward pass to make sure inference works in fp16
|
||||
|
||||
@@ -46,7 +46,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DeiTFeatureExtractor
|
||||
from transformers import DeiTImageProcessor
|
||||
|
||||
|
||||
class TFDeiTModelTester:
|
||||
@@ -266,9 +266,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class DeiTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
|
||||
DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -277,9 +277,9 @@ class DeiTModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFDeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_timm_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DetrFeatureExtractor
|
||||
from transformers import DetrImageProcessor
|
||||
|
||||
|
||||
class DetrModelTester:
|
||||
@@ -512,15 +512,15 @@ def prepare_img():
|
||||
@slow
|
||||
class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None
|
||||
|
||||
def test_inference_no_head(self):
|
||||
model = DetrModel.from_pretrained("facebook/detr-resnet-50").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**encoding)
|
||||
@@ -535,9 +535,9 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
|
||||
def test_inference_object_detection_head(self):
|
||||
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
pixel_values = encoding["pixel_values"].to(torch_device)
|
||||
pixel_mask = encoding["pixel_mask"].to(torch_device)
|
||||
|
||||
@@ -560,7 +560,7 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
|
||||
|
||||
# verify postprocessing
|
||||
results = feature_extractor.post_process_object_detection(
|
||||
results = image_processor.post_process_object_detection(
|
||||
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
|
||||
)[0]
|
||||
expected_scores = torch.tensor([0.9982, 0.9960, 0.9955, 0.9988, 0.9987]).to(torch_device)
|
||||
@@ -575,9 +575,9 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
|
||||
def test_inference_panoptic_segmentation_head(self):
|
||||
model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
pixel_values = encoding["pixel_values"].to(torch_device)
|
||||
pixel_mask = encoding["pixel_mask"].to(torch_device)
|
||||
|
||||
@@ -607,7 +607,7 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-3))
|
||||
|
||||
# verify postprocessing
|
||||
results = feature_extractor.post_process_panoptic_segmentation(
|
||||
results = image_processor.post_process_panoptic_segmentation(
|
||||
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
|
||||
)[0]
|
||||
|
||||
@@ -633,9 +633,9 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
|
||||
@slow
|
||||
class DetrModelIntegrationTests(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
|
||||
DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -643,9 +643,9 @@ class DetrModelIntegrationTests(unittest.TestCase):
|
||||
def test_inference_no_head(self):
|
||||
model = DetrModel.from_pretrained("facebook/detr-resnet-50", revision="no_timm").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**encoding)
|
||||
|
||||
@@ -367,16 +367,16 @@ class DinatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
@require_torch
|
||||
class DinatModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = DinatForImageClassification.from_pretrained("shi-labs/dinat-mini-in1k-224").to(torch_device)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -25,7 +25,7 @@ if is_torch_available():
|
||||
from transformers import AutoModelForImageClassification
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
@require_torch
|
||||
@@ -33,7 +33,7 @@ if is_vision_available():
|
||||
class DiTIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_for_image_classification(self):
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/dit-base-finetuned-rvlcdip")
|
||||
image_processor = AutoImageProcessor.from_pretrained("microsoft/dit-base-finetuned-rvlcdip")
|
||||
model = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-finetuned-rvlcdip")
|
||||
model.to(torch_device)
|
||||
|
||||
@@ -43,7 +43,7 @@ class DiTIntegrationTest(unittest.TestCase):
|
||||
|
||||
image = dataset["train"][0]["image"].convert("RGB")
|
||||
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DPTFeatureExtractor
|
||||
from transformers import DPTImageProcessor
|
||||
|
||||
|
||||
class DPTModelTester:
|
||||
@@ -293,11 +293,11 @@ def prepare_img():
|
||||
@slow
|
||||
class DPTModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_depth_estimation(self):
|
||||
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
|
||||
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
|
||||
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -315,11 +315,11 @@ class DPTModelIntegrationTest(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4))
|
||||
|
||||
def test_inference_semantic_segmentation(self):
|
||||
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large-ade")
|
||||
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade")
|
||||
model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade").to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -336,11 +336,11 @@ class DPTModelIntegrationTest(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.logits[0, 0, :3, :3], expected_slice, atol=1e-4))
|
||||
|
||||
def test_post_processing_semantic_segmentation(self):
|
||||
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large-ade")
|
||||
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade")
|
||||
model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade").to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -348,10 +348,10 @@ class DPTModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
outputs.logits = outputs.logits.detach().cpu()
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
|
||||
expected_shape = torch.Size((500, 300))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs)
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
|
||||
expected_shape = torch.Size((480, 480))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DPTFeatureExtractor
|
||||
from transformers import DPTImageProcessor
|
||||
|
||||
|
||||
class DPTModelTester:
|
||||
@@ -314,11 +314,11 @@ def prepare_img():
|
||||
@slow
|
||||
class DPTModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_depth_estimation(self):
|
||||
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
|
||||
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
|
||||
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -444,7 +444,7 @@ def prepare_img():
|
||||
@require_vision
|
||||
class EfficientFormerModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
|
||||
if is_vision_available()
|
||||
@@ -457,9 +457,9 @@ class EfficientFormerModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -478,9 +478,9 @@ class EfficientFormerModelIntegrationTest(unittest.TestCase):
|
||||
"snap-research/efficientformer-l1-300"
|
||||
).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -37,7 +37,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import GLPNFeatureExtractor
|
||||
from transformers import GLPNImageProcessor
|
||||
|
||||
|
||||
class GLPNConfigTester(ConfigTester):
|
||||
@@ -337,11 +337,11 @@ def prepare_img():
|
||||
class GLPNModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_depth_estimation(self):
|
||||
feature_extractor = GLPNFeatureExtractor.from_pretrained(GLPN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
image_processor = GLPNImageProcessor.from_pretrained(GLPN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
model = GLPNForDepthEstimation.from_pretrained(GLPN_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -49,7 +49,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ImageGPTFeatureExtractor
|
||||
from transformers import ImageGPTImageProcessor
|
||||
|
||||
|
||||
class ImageGPTModelTester:
|
||||
@@ -535,16 +535,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ImageGPTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_causal_lm_head(self):
|
||||
model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -45,7 +45,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import LayoutLMv3FeatureExtractor
|
||||
from transformers import LayoutLMv3ImageProcessor
|
||||
|
||||
|
||||
class LayoutLMv3ModelTester:
|
||||
@@ -382,16 +382,16 @@ def prepare_img():
|
||||
@require_torch
|
||||
class LayoutLMv3ModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return LayoutLMv3FeatureExtractor(apply_ocr=False) if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return LayoutLMv3ImageProcessor(apply_ocr=False) if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
|
||||
input_ids = torch.tensor([[1, 2]])
|
||||
bbox = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).unsqueeze(0)
|
||||
|
||||
@@ -51,7 +51,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import LayoutLMv3FeatureExtractor
|
||||
from transformers import LayoutLMv3ImageProcessor
|
||||
|
||||
|
||||
class TFLayoutLMv3ModelTester:
|
||||
@@ -482,16 +482,16 @@ def prepare_img():
|
||||
@require_tf
|
||||
class TFLayoutLMv3ModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return LayoutLMv3FeatureExtractor(apply_ocr=False) if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return LayoutLMv3ImageProcessor(apply_ocr=False) if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
model = TFLayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
pixel_values = feature_extractor(images=image, return_tensors="tf").pixel_values
|
||||
pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
|
||||
|
||||
input_ids = tf.constant([[1, 2]])
|
||||
bbox = tf.expand_dims(tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]]), axis=0)
|
||||
|
||||
@@ -36,7 +36,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
|
||||
if is_pytesseract_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import LayoutLMv2FeatureExtractor, LayoutXLMProcessor
|
||||
from transformers import LayoutLMv2ImageProcessor, LayoutXLMProcessor
|
||||
|
||||
|
||||
@require_pytesseract
|
||||
@@ -47,7 +47,7 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
rust_tokenizer_class = LayoutXLMTokenizerFast
|
||||
|
||||
def setUp(self):
|
||||
feature_extractor_map = {
|
||||
image_processor_map = {
|
||||
"do_resize": True,
|
||||
"size": 224,
|
||||
"apply_ocr": True,
|
||||
@@ -56,7 +56,7 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
||||
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(feature_extractor_map) + "\n")
|
||||
fp.write(json.dumps(image_processor_map) + "\n")
|
||||
|
||||
# taken from `test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_save_pretrained`
|
||||
self.tokenizer_pretrained_name = "hf-internal-testing/tiny-random-layoutxlm"
|
||||
@@ -70,8 +70,8 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
|
||||
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
|
||||
|
||||
def get_feature_extractor(self, **kwargs):
|
||||
return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
def get_image_processor(self, **kwargs):
|
||||
return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
@@ -88,10 +88,10 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
return image_inputs
|
||||
|
||||
def test_save_load_pretrained_default(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizers = self.get_tokenizers()
|
||||
for tokenizer in tokenizers:
|
||||
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname)
|
||||
@@ -99,16 +99,16 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||
self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast))
|
||||
|
||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
|
||||
|
||||
def test_save_load_pretrained_additional_features(self):
|
||||
processor = LayoutXLMProcessor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
|
||||
processor = LayoutXLMProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
|
||||
# slow tokenizer
|
||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
|
||||
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
|
||||
|
||||
processor = LayoutXLMProcessor.from_pretrained(
|
||||
self.tmpdirname,
|
||||
@@ -122,12 +122,12 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||
self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer)
|
||||
|
||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
|
||||
|
||||
# fast tokenizer
|
||||
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
|
||||
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
|
||||
|
||||
processor = LayoutXLMProcessor.from_pretrained(
|
||||
self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
|
||||
@@ -136,14 +136,14 @@ class LayoutXLMProcessorTest(unittest.TestCase):
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||
self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast)
|
||||
|
||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
|
||||
|
||||
def test_model_input_names(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = LayoutXLMProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
processor = LayoutXLMProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -215,15 +215,15 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||
def test_processor_case_1(self):
|
||||
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
|
||||
|
||||
feature_extractor = LayoutLMv2FeatureExtractor()
|
||||
image_processor = LayoutLMv2ImageProcessor()
|
||||
tokenizers = self.get_tokenizers
|
||||
images = self.get_images
|
||||
|
||||
for tokenizer in tokenizers:
|
||||
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
|
||||
# not batched
|
||||
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
|
||||
input_feat_extract = image_processor(images[0], return_tensors="pt")
|
||||
input_processor = processor(images[0], return_tensors="pt")
|
||||
|
||||
# verify keys
|
||||
@@ -245,7 +245,7 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||
self.assertSequenceEqual(decoding, expected_decoding)
|
||||
|
||||
# batched
|
||||
input_feat_extract = feature_extractor(images, return_tensors="pt")
|
||||
input_feat_extract = image_processor(images, return_tensors="pt")
|
||||
input_processor = processor(images, padding=True, return_tensors="pt")
|
||||
|
||||
# verify keys
|
||||
@@ -270,12 +270,12 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||
def test_processor_case_2(self):
|
||||
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
|
||||
|
||||
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
||||
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
|
||||
tokenizers = self.get_tokenizers
|
||||
images = self.get_images
|
||||
|
||||
for tokenizer in tokenizers:
|
||||
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
|
||||
# not batched
|
||||
words = ["hello", "world"]
|
||||
@@ -324,12 +324,12 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||
def test_processor_case_3(self):
|
||||
# case 3: token classification (training), apply_ocr=False
|
||||
|
||||
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
||||
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
|
||||
tokenizers = self.get_tokenizers
|
||||
images = self.get_images
|
||||
|
||||
for tokenizer in tokenizers:
|
||||
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
|
||||
# not batched
|
||||
words = ["weirdly", "world"]
|
||||
@@ -389,12 +389,12 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||
def test_processor_case_4(self):
|
||||
# case 4: visual question answering (inference), apply_ocr=True
|
||||
|
||||
feature_extractor = LayoutLMv2FeatureExtractor()
|
||||
image_processor = LayoutLMv2ImageProcessor()
|
||||
tokenizers = self.get_tokenizers
|
||||
images = self.get_images
|
||||
|
||||
for tokenizer in tokenizers:
|
||||
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
|
||||
# not batched
|
||||
question = "What's his name?"
|
||||
@@ -440,12 +440,12 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
||||
def test_processor_case_5(self):
|
||||
# case 5: visual question answering (inference), apply_ocr=False
|
||||
|
||||
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
|
||||
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
|
||||
tokenizers = self.get_tokenizers
|
||||
images = self.get_images
|
||||
|
||||
for tokenizer in tokenizers:
|
||||
processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
|
||||
# not batched
|
||||
question = "What's his name?"
|
||||
|
||||
@@ -46,7 +46,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import LevitFeatureExtractor
|
||||
from transformers import LevitImageProcessor
|
||||
|
||||
|
||||
class LevitConfigTester(ConfigTester):
|
||||
@@ -409,8 +409,8 @@ def prepare_img():
|
||||
@require_vision
|
||||
class LevitModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return LevitFeatureExtractor.from_pretrained(LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
def default_image_processor(self):
|
||||
return LevitImageProcessor.from_pretrained(LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
@@ -418,9 +418,9 @@ class LevitModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -545,9 +545,9 @@ class Mask2FormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
|
||||
self.assertEqual(segmentation[0].shape, target_sizes[0])
|
||||
|
||||
def test_post_process_instance_segmentation(self):
|
||||
feature_extractor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes)
|
||||
image_processor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes)
|
||||
outputs = self.image_processor_tester.get_fake_mask2former_outputs()
|
||||
segmentation = feature_extractor.post_process_instance_segmentation(outputs, threshold=0)
|
||||
segmentation = image_processor.post_process_instance_segmentation(outputs, threshold=0)
|
||||
|
||||
self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size)
|
||||
for el in segmentation:
|
||||
@@ -556,7 +556,7 @@ class Mask2FormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
|
||||
self.assertEqual(type(el["segments_info"]), list)
|
||||
self.assertEqual(el["segmentation"].shape, (384, 384))
|
||||
|
||||
segmentation = feature_extractor.post_process_instance_segmentation(
|
||||
segmentation = image_processor.post_process_instance_segmentation(
|
||||
outputs, threshold=0, return_binary_maps=True
|
||||
)
|
||||
|
||||
|
||||
@@ -325,14 +325,14 @@ class Mask2FormerModelIntegrationTest(unittest.TestCase):
|
||||
return "facebook/mask2former-swin-small-coco-instance"
|
||||
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return Mask2FormerImageProcessor.from_pretrained(self.model_checkpoints) if is_vision_available() else None
|
||||
|
||||
def test_inference_no_head(self):
|
||||
model = Mask2FormerModel.from_pretrained(self.model_checkpoints).to(torch_device)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
inputs_shape = inputs["pixel_values"].shape
|
||||
# check size is divisible by 32
|
||||
self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
|
||||
@@ -371,9 +371,9 @@ class Mask2FormerModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
def test_inference_universal_segmentation_head(self):
|
||||
model = Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).to(torch_device).eval()
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
inputs_shape = inputs["pixel_values"].shape
|
||||
# check size is divisible by 32
|
||||
self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
|
||||
@@ -408,9 +408,9 @@ class Mask2FormerModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
def test_with_segmentation_maps_and_loss(self):
|
||||
model = Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).to(torch_device).eval()
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
inputs = feature_extractor(
|
||||
inputs = image_processor(
|
||||
[np.zeros((3, 800, 1333)), np.zeros((3, 800, 1333))],
|
||||
segmentation_maps=[np.zeros((384, 384)).astype(np.float32), np.zeros((384, 384)).astype(np.float32)],
|
||||
return_tensors="pt",
|
||||
|
||||
@@ -574,9 +574,9 @@ class MaskFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Tes
|
||||
self.assertEqual(segmentation[0].shape, target_sizes[0])
|
||||
|
||||
def test_post_process_instance_segmentation(self):
|
||||
feature_extractor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes)
|
||||
image_processor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes)
|
||||
outputs = self.image_processor_tester.get_fake_maskformer_outputs()
|
||||
segmentation = feature_extractor.post_process_instance_segmentation(outputs, threshold=0)
|
||||
segmentation = image_processor.post_process_instance_segmentation(outputs, threshold=0)
|
||||
|
||||
self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size)
|
||||
for el in segmentation:
|
||||
@@ -587,7 +587,7 @@ class MaskFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Tes
|
||||
el["segmentation"].shape, (self.image_processor_tester.height, self.image_processor_tester.width)
|
||||
)
|
||||
|
||||
segmentation = feature_extractor.post_process_instance_segmentation(
|
||||
segmentation = image_processor.post_process_instance_segmentation(
|
||||
outputs, threshold=0, return_binary_maps=True
|
||||
)
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ if is_torch_available():
|
||||
from transformers import MaskFormerForInstanceSegmentation, MaskFormerModel
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import MaskFormerFeatureExtractor
|
||||
from transformers import MaskFormerImageProcessor
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -326,18 +326,18 @@ def prepare_img():
|
||||
@slow
|
||||
class MaskFormerModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
MaskFormerFeatureExtractor.from_pretrained("facebook/maskformer-swin-small-coco")
|
||||
MaskFormerImageProcessor.from_pretrained("facebook/maskformer-swin-small-coco")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
|
||||
def test_inference_no_head(self):
|
||||
model = MaskFormerModel.from_pretrained("facebook/maskformer-swin-small-coco").to(torch_device)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
inputs_shape = inputs["pixel_values"].shape
|
||||
# check size is divisible by 32
|
||||
self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
|
||||
@@ -380,9 +380,9 @@ class MaskFormerModelIntegrationTest(unittest.TestCase):
|
||||
.to(torch_device)
|
||||
.eval()
|
||||
)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
inputs_shape = inputs["pixel_values"].shape
|
||||
# check size is divisible by 32
|
||||
self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
|
||||
@@ -424,9 +424,9 @@ class MaskFormerModelIntegrationTest(unittest.TestCase):
|
||||
.to(torch_device)
|
||||
.eval()
|
||||
)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
inputs_shape = inputs["pixel_values"].shape
|
||||
# check size is divisible by 32
|
||||
self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
|
||||
@@ -460,9 +460,9 @@ class MaskFormerModelIntegrationTest(unittest.TestCase):
|
||||
.to(torch_device)
|
||||
.eval()
|
||||
)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
inputs = feature_extractor(
|
||||
inputs = image_processor(
|
||||
[np.zeros((3, 800, 1333)), np.zeros((3, 800, 1333))],
|
||||
segmentation_maps=[np.zeros((384, 384)).astype(np.float32), np.zeros((384, 384)).astype(np.float32)],
|
||||
return_tensors="pt",
|
||||
|
||||
@@ -64,7 +64,7 @@ class MgpstrProcessorTest(unittest.TestCase):
|
||||
image_processor_map = {
|
||||
"do_normalize": False,
|
||||
"do_resize": True,
|
||||
"feature_extractor_type": "ViTFeatureExtractor",
|
||||
"image_processor_type": "ViTImageProcessor",
|
||||
"resample": 3,
|
||||
"size": {"height": 32, "width": 128},
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import MobileNetV1FeatureExtractor
|
||||
from transformers import MobileNetV1ImageProcessor
|
||||
|
||||
|
||||
class MobileNetV1ConfigTester(ConfigTester):
|
||||
@@ -240,20 +240,18 @@ def prepare_img():
|
||||
@require_vision
|
||||
class MobileNetV1ModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
MobileNetV1FeatureExtractor.from_pretrained("google/mobilenet_v1_1.0_224")
|
||||
if is_vision_available()
|
||||
else None
|
||||
MobileNetV1ImageProcessor.from_pretrained("google/mobilenet_v1_1.0_224") if is_vision_available() else None
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = MobileNetV1ForImageClassification.from_pretrained("google/mobilenet_v1_1.0_224").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -37,7 +37,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import MobileNetV2FeatureExtractor
|
||||
from transformers import MobileNetV2ImageProcessor
|
||||
|
||||
|
||||
class MobileNetV2ConfigTester(ConfigTester):
|
||||
@@ -295,20 +295,18 @@ def prepare_img():
|
||||
@require_vision
|
||||
class MobileNetV2ModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
MobileNetV2FeatureExtractor.from_pretrained("google/mobilenet_v2_1.0_224")
|
||||
if is_vision_available()
|
||||
else None
|
||||
MobileNetV2ImageProcessor.from_pretrained("google/mobilenet_v2_1.0_224") if is_vision_available() else None
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = MobileNetV2ForImageClassification.from_pretrained("google/mobilenet_v2_1.0_224").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -327,10 +325,10 @@ class MobileNetV2ModelIntegrationTest(unittest.TestCase):
|
||||
model = MobileNetV2ForSemanticSegmentation.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513")
|
||||
model = model.to(torch_device)
|
||||
|
||||
feature_extractor = MobileNetV2FeatureExtractor.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513")
|
||||
image_processor = MobileNetV2ImageProcessor.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513")
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -37,7 +37,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import MobileViTFeatureExtractor
|
||||
from transformers import MobileViTImageProcessor
|
||||
|
||||
|
||||
class MobileViTConfigTester(ConfigTester):
|
||||
@@ -298,16 +298,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class MobileViTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-xx-small") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return MobileViTImageProcessor.from_pretrained("apple/mobilevit-xx-small") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = MobileViTForImageClassification.from_pretrained("apple/mobilevit-xx-small").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -326,10 +326,10 @@ class MobileViTModelIntegrationTest(unittest.TestCase):
|
||||
model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
model = model.to(torch_device)
|
||||
|
||||
feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -356,10 +356,10 @@ class MobileViTModelIntegrationTest(unittest.TestCase):
|
||||
model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
model = model.to(torch_device)
|
||||
|
||||
feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -367,10 +367,10 @@ class MobileViTModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
outputs.logits = outputs.logits.detach().cpu()
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(50, 60)])
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(50, 60)])
|
||||
expected_shape = torch.Size((50, 60))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs)
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
|
||||
expected_shape = torch.Size((32, 32))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
@@ -40,7 +40,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import MobileViTFeatureExtractor
|
||||
from transformers import MobileViTImageProcessor
|
||||
|
||||
|
||||
class TFMobileViTConfigTester(ConfigTester):
|
||||
@@ -381,9 +381,9 @@ class TFMobileViTModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFMobileViTForImageClassification.from_pretrained("apple/mobilevit-xx-small")
|
||||
|
||||
feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-xx-small")
|
||||
image_processor = MobileViTImageProcessor.from_pretrained("apple/mobilevit-xx-small")
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs, training=False)
|
||||
@@ -401,10 +401,10 @@ class TFMobileViTModelIntegrationTest(unittest.TestCase):
|
||||
# `from_pt` will be removed
|
||||
model = TFMobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
|
||||
feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-xx-small")
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(inputs.pixel_values, training=False)
|
||||
|
||||
@@ -364,16 +364,16 @@ class NatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
@require_torch
|
||||
class NatModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -61,7 +61,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import PerceiverFeatureExtractor
|
||||
from transformers import PerceiverImageProcessor
|
||||
|
||||
|
||||
class PerceiverModelTester:
|
||||
@@ -899,13 +899,13 @@ class PerceiverModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification(self):
|
||||
feature_extractor = PerceiverFeatureExtractor()
|
||||
image_processor = PerceiverImageProcessor()
|
||||
model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned")
|
||||
model.to(torch_device)
|
||||
|
||||
# prepare inputs
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
input_mask = None
|
||||
|
||||
# forward pass
|
||||
@@ -923,13 +923,13 @@ class PerceiverModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_fourier(self):
|
||||
feature_extractor = PerceiverFeatureExtractor()
|
||||
image_processor = PerceiverImageProcessor()
|
||||
model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier")
|
||||
model.to(torch_device)
|
||||
|
||||
# prepare inputs
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
input_mask = None
|
||||
|
||||
# forward pass
|
||||
@@ -947,13 +947,13 @@ class PerceiverModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_conv(self):
|
||||
feature_extractor = PerceiverFeatureExtractor()
|
||||
image_processor = PerceiverImageProcessor()
|
||||
model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv")
|
||||
model.to(torch_device)
|
||||
|
||||
# prepare inputs
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").pixel_values.to(torch_device)
|
||||
input_mask = None
|
||||
|
||||
# forward pass
|
||||
|
||||
@@ -37,7 +37,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import PoolFormerFeatureExtractor
|
||||
from transformers import PoolFormerImageProcessor
|
||||
|
||||
|
||||
class PoolFormerConfigTester(ConfigTester):
|
||||
@@ -237,10 +237,10 @@ def prepare_img():
|
||||
class PoolFormerModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
feature_extractor = PoolFormerFeatureExtractor()
|
||||
image_processor = PoolFormerImageProcessor()
|
||||
model = PoolFormerForImageClassification.from_pretrained("sail/poolformer_s12").to(torch_device)
|
||||
|
||||
inputs = feature_extractor(images=prepare_img(), return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=prepare_img(), return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -33,7 +33,7 @@ if is_flax_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class FlaxRegNetModelTester(unittest.TestCase):
|
||||
@@ -215,16 +215,16 @@ def prepare_img():
|
||||
@require_flax
|
||||
class FlaxRegNetModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained("facebook/regnet-y-040") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("facebook/regnet-y-040") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = FlaxRegNetForImageClassification.from_pretrained("facebook/regnet-y-040")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="np")
|
||||
inputs = image_processor(images=image, return_tensors="np")
|
||||
|
||||
outputs = model(**inputs)
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class RegNetModelTester:
|
||||
@@ -248,9 +248,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class RegNetModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
AutoImageProcessor.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -259,9 +259,9 @@ class RegNetModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = RegNetForImageClassification.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class TFRegNetModelTester:
|
||||
@@ -267,9 +267,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class RegNetModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained(TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
AutoImageProcessor.from_pretrained(TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -278,9 +278,9 @@ class RegNetModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFRegNetForImageClassification.from_pretrained(TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs, training=False)
|
||||
|
||||
@@ -32,7 +32,7 @@ if is_flax_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class FlaxResNetModelTester(unittest.TestCase):
|
||||
@@ -206,16 +206,16 @@ def prepare_img():
|
||||
@require_flax
|
||||
class FlaxResNetModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained("microsoft/resnet-50") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("microsoft/resnet-50") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = FlaxResNetForImageClassification.from_pretrained("microsoft/resnet-50")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="np")
|
||||
inputs = image_processor(images=image, return_tensors="np")
|
||||
|
||||
outputs = model(**inputs)
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class ResNetModelTester:
|
||||
@@ -301,9 +301,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ResNetModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained(RESNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
AutoImageProcessor.from_pretrained(RESNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -312,9 +312,9 @@ class ResNetModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = ResNetForImageClassification.from_pretrained(RESNET_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -41,7 +41,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class TFResNetModelTester:
|
||||
@@ -229,9 +229,9 @@ def prepare_img():
|
||||
@require_vision
|
||||
class TFResNetModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained(TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
AutoImageProcessor.from_pretrained(TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -240,9 +240,9 @@ class TFResNetModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFResNetForImageClassification.from_pretrained(TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -42,7 +42,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import SegformerFeatureExtractor
|
||||
from transformers import SegformerImageProcessor
|
||||
|
||||
|
||||
class SegformerConfigTester(ConfigTester):
|
||||
@@ -365,7 +365,7 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_segmentation_ade(self):
|
||||
# only resize + normalize
|
||||
feature_extractor = SegformerFeatureExtractor(
|
||||
image_processor = SegformerImageProcessor(
|
||||
image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
|
||||
)
|
||||
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to(
|
||||
@@ -373,7 +373,7 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
image = prepare_img()
|
||||
encoded_inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
encoded_inputs = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = encoded_inputs.pixel_values.to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
@@ -394,7 +394,7 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_segmentation_city(self):
|
||||
# only resize + normalize
|
||||
feature_extractor = SegformerFeatureExtractor(
|
||||
image_processor = SegformerImageProcessor(
|
||||
image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
|
||||
)
|
||||
model = SegformerForSemanticSegmentation.from_pretrained(
|
||||
@@ -402,7 +402,7 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
).to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
encoded_inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
encoded_inputs = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = encoded_inputs.pixel_values.to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
@@ -423,7 +423,7 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_post_processing_semantic_segmentation(self):
|
||||
# only resize + normalize
|
||||
feature_extractor = SegformerFeatureExtractor(
|
||||
image_processor = SegformerImageProcessor(
|
||||
image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
|
||||
)
|
||||
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to(
|
||||
@@ -431,7 +431,7 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
image = prepare_img()
|
||||
encoded_inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
encoded_inputs = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = encoded_inputs.pixel_values.to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
@@ -439,10 +439,10 @@ class SegformerModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
outputs.logits = outputs.logits.detach().cpu()
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
|
||||
expected_shape = torch.Size((500, 300))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs)
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
|
||||
expected_shape = torch.Size((128, 128))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import SegformerFeatureExtractor
|
||||
from transformers import SegformerImageProcessor
|
||||
|
||||
|
||||
class TFSegformerConfigTester(ConfigTester):
|
||||
@@ -454,13 +454,13 @@ class TFSegformerModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_segmentation_ade(self):
|
||||
# only resize + normalize
|
||||
feature_extractor = SegformerFeatureExtractor(
|
||||
image_processor = SegformerImageProcessor(
|
||||
image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
|
||||
)
|
||||
model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
|
||||
|
||||
image = prepare_img()
|
||||
encoded_inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
encoded_inputs = image_processor(images=image, return_tensors="tf")
|
||||
pixel_values = encoded_inputs.pixel_values
|
||||
|
||||
outputs = model(pixel_values, training=False)
|
||||
@@ -480,7 +480,7 @@ class TFSegformerModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_segmentation_city(self):
|
||||
# only resize + normalize
|
||||
feature_extractor = SegformerFeatureExtractor(
|
||||
image_processor = SegformerImageProcessor(
|
||||
image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
|
||||
)
|
||||
model = TFSegformerForSemanticSegmentation.from_pretrained(
|
||||
@@ -488,7 +488,7 @@ class TFSegformerModelIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
image = prepare_img()
|
||||
encoded_inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
encoded_inputs = image_processor(images=image, return_tensors="tf")
|
||||
pixel_values = encoded_inputs.pixel_values
|
||||
|
||||
outputs = model(pixel_values, training=False)
|
||||
|
||||
@@ -283,16 +283,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class SwiftFormerModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class SwinModelTester:
|
||||
@@ -482,9 +482,9 @@ class SwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
@require_torch
|
||||
class SwinModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
|
||||
AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -492,10 +492,10 @@ class SwinModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = SwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224").to(torch_device)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -45,7 +45,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class TFSwinModelTester:
|
||||
@@ -382,9 +382,9 @@ class TFSwinModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
||||
@require_tf
|
||||
class TFSwinModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
|
||||
AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -392,10 +392,10 @@ class TFSwinModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(inputs)
|
||||
|
||||
@@ -36,7 +36,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class Swinv2ModelTester:
|
||||
@@ -412,9 +412,9 @@ class Swinv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
@require_torch
|
||||
class Swinv2ModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
|
||||
AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -424,10 +424,10 @@ class Swinv2ModelIntegrationTest(unittest.TestCase):
|
||||
model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256").to(
|
||||
torch_device
|
||||
)
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_timm_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class TableTransformerModelTester:
|
||||
@@ -501,13 +501,13 @@ def prepare_img():
|
||||
@slow
|
||||
class TableTransformerModelIntegrationTests(unittest.TestCase):
|
||||
def test_table_detection(self):
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
|
||||
image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
|
||||
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
|
||||
model.to(torch_device)
|
||||
|
||||
file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
|
||||
image = Image.open(file_path).convert("RGB")
|
||||
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -45,7 +45,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import VideoMAEFeatureExtractor
|
||||
from transformers import VideoMAEImageProcessor
|
||||
|
||||
|
||||
class TimesformerModelTester:
|
||||
@@ -339,10 +339,10 @@ def prepare_video():
|
||||
@require_vision
|
||||
class TimesformerModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
# logits were tested with a different mean and std, so we use the same here
|
||||
return (
|
||||
VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
|
||||
VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -353,9 +353,9 @@ class TimesformerModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
video = prepare_video()
|
||||
inputs = feature_extractor(video[:8], return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(video[:8], return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -564,7 +564,7 @@ def prepare_audio(num_samples=1):
|
||||
@require_vision
|
||||
class TvltModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_processors(self):
|
||||
# logits were tested with a different mean and std, so we use the same here
|
||||
return (
|
||||
TvltImageProcessor() if is_vision_available() else None,
|
||||
@@ -574,7 +574,7 @@ class TvltModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_for_base_model(self):
|
||||
model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
|
||||
|
||||
image_processor, audio_feature_extractor = self.default_feature_extractor
|
||||
image_processor, audio_feature_extractor = self.default_processors
|
||||
video = prepare_video()
|
||||
audio = prepare_audio()
|
||||
video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
|
||||
@@ -596,7 +596,7 @@ class TvltModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_for_pretraining(self):
|
||||
model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
|
||||
|
||||
image_processor, audio_feature_extractor = self.default_feature_extractor
|
||||
image_processor, audio_feature_extractor = self.default_processors
|
||||
video = prepare_video()
|
||||
video_mixed = prepare_video()
|
||||
audio = prepare_audio()
|
||||
|
||||
@@ -42,7 +42,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class VanModelTester:
|
||||
@@ -254,16 +254,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class VanModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = VanForImageClassification.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -46,7 +46,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import VideoMAEFeatureExtractor
|
||||
from transformers import VideoMAEImageProcessor
|
||||
|
||||
|
||||
class VideoMAEModelTester:
|
||||
@@ -359,10 +359,10 @@ def prepare_video():
|
||||
@require_vision
|
||||
class VideoMAEModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
# logits were tested with a different mean and std, so we use the same here
|
||||
return (
|
||||
VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
|
||||
VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
@@ -373,9 +373,9 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
video = prepare_video()
|
||||
inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(video, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -393,9 +393,9 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_for_pretraining(self):
|
||||
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
video = prepare_video()
|
||||
inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(video, return_tensors="pt").to(torch_device)
|
||||
|
||||
# add boolean mask, indicating which patches to mask
|
||||
local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
|
||||
|
||||
@@ -48,7 +48,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
@require_flax
|
||||
@@ -462,12 +462,12 @@ class FlaxViT2GPT2ModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_coco_en(self):
|
||||
loc = "ydshieh/vit-gpt2-coco-en"
|
||||
|
||||
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
|
||||
image_processor = ViTImageProcessor.from_pretrained(loc)
|
||||
tokenizer = AutoTokenizer.from_pretrained(loc)
|
||||
model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
|
||||
|
||||
img = prepare_img()
|
||||
pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values
|
||||
pixel_values = image_processor(images=img, return_tensors="np").pixel_values
|
||||
|
||||
decoder_input_ids = np.array([[model.config.decoder_start_token_id]])
|
||||
logits = model(pixel_values, decoder_input_ids)[0]
|
||||
|
||||
@@ -45,7 +45,7 @@ if is_tf_available():
|
||||
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoFeatureExtractor,
|
||||
AutoImageProcessor,
|
||||
AutoTokenizer,
|
||||
TFAutoModel,
|
||||
TFAutoModelForCausalLM,
|
||||
@@ -64,7 +64,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
@require_tf
|
||||
@@ -828,11 +828,11 @@ class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
|
||||
load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix
|
||||
|
||||
config = self.get_encoder_decoder_config()
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
|
||||
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
|
||||
decoder_tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
img = prepare_img()
|
||||
pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
|
||||
pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
|
||||
decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dirname:
|
||||
@@ -893,13 +893,13 @@ class TFViT2GPT2ModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_coco_en(self):
|
||||
loc = "ydshieh/vit-gpt2-coco-en"
|
||||
|
||||
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
|
||||
image_processor = ViTImageProcessor.from_pretrained(loc)
|
||||
tokenizer = AutoTokenizer.from_pretrained(loc)
|
||||
model = TFVisionEncoderDecoderModel.from_pretrained(loc)
|
||||
|
||||
# We will verify our results on an image of cute cats
|
||||
img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
|
||||
pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
|
||||
|
||||
decoder_input_ids = tf.constant([[model.config.decoder_start_token_id]])
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ if is_vision_available():
|
||||
import PIL
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
@require_torch
|
||||
@@ -749,7 +749,7 @@ class ViT2GPT2ModelIntegrationTest(unittest.TestCase):
|
||||
def test_inference_coco_en(self):
|
||||
loc = "ydshieh/vit-gpt2-coco-en"
|
||||
|
||||
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
|
||||
image_processor = ViTImageProcessor.from_pretrained(loc)
|
||||
tokenizer = AutoTokenizer.from_pretrained(loc)
|
||||
model = VisionEncoderDecoderModel.from_pretrained(loc)
|
||||
model.to(torch_device)
|
||||
@@ -757,7 +757,7 @@ class ViT2GPT2ModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
# We will verify our results on an image of cute cats
|
||||
img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)
|
||||
pixel_values = image_processor(images=img, return_tensors="pt").pixel_values.to(torch_device)
|
||||
|
||||
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)
|
||||
|
||||
|
||||
@@ -170,10 +170,10 @@ class VisionTextDualEncoderProcessorTest(unittest.TestCase):
|
||||
self.assertListEqual(decoded_tok, decoded_processor)
|
||||
|
||||
def test_model_input_names(self):
|
||||
feature_extractor = self.get_image_processor()
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
class TFViTModelTester:
|
||||
@@ -228,16 +228,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class TFViTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -45,7 +45,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
class ViTModelTester:
|
||||
@@ -264,16 +264,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ViTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -295,9 +295,9 @@ class ViTModelIntegrationTest(unittest.TestCase):
|
||||
# to visualize self-attention on higher resolution images.
|
||||
model = ViTModel.from_pretrained("facebook/dino-vits8").to(torch_device)
|
||||
|
||||
feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/dino-vits8", size=480)
|
||||
image_processor = ViTImageProcessor.from_pretrained("facebook/dino-vits8", size=480)
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
inputs = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = inputs.pixel_values.to(torch_device)
|
||||
|
||||
# forward pass
|
||||
@@ -322,10 +322,10 @@ class ViTModelIntegrationTest(unittest.TestCase):
|
||||
A small test to make sure that inference works in half precision without any problem.
|
||||
"""
|
||||
model = ViTModel.from_pretrained("facebook/dino-vits8", torch_dtype=torch.float16, device_map="auto")
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
inputs = image_processor(images=image, return_tensors="pt")
|
||||
pixel_values = inputs.pixel_values.to(torch_device)
|
||||
|
||||
# forward pass to make sure inference works in fp16
|
||||
|
||||
@@ -243,7 +243,7 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ViTModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
ViTHybridImageProcessor.from_pretrained(VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST[0])
|
||||
if is_vision_available()
|
||||
@@ -256,9 +256,9 @@ class ViTModelIntegrationTest(unittest.TestCase):
|
||||
torch_device
|
||||
)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -275,12 +275,12 @@ class ViTModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_accelerate
|
||||
def test_accelerate_inference(self):
|
||||
feature_extractor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
|
||||
image_processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
|
||||
model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", device_map="auto")
|
||||
|
||||
image = prepare_img()
|
||||
|
||||
inputs = feature_extractor(images=image, return_tensors="pt")
|
||||
inputs = image_processor(images=image, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.logits
|
||||
# model predicts one of the 1000 ImageNet classes
|
||||
|
||||
@@ -46,7 +46,7 @@ if is_tf_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
class TFViTMAEModelTester:
|
||||
@@ -424,8 +424,8 @@ def prepare_img():
|
||||
@require_vision
|
||||
class TFViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_for_pretraining(self):
|
||||
@@ -434,9 +434,9 @@ class TFViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="tf")
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
|
||||
# prepare a noise vector that will also be used for testing the TF model
|
||||
# (this way we can ensure that the PT and TF models operate on the same inputs)
|
||||
|
||||
@@ -42,7 +42,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
class ViTMAEModelTester:
|
||||
@@ -296,8 +296,8 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_for_pretraining(self):
|
||||
@@ -306,9 +306,9 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# prepare a noise vector that will also be used for testing the TF model
|
||||
# (this way we can ensure that the PT and TF models operate on the same inputs)
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ViTFeatureExtractor
|
||||
from transformers import ViTImageProcessor
|
||||
|
||||
|
||||
class ViTMSNModelTester:
|
||||
@@ -220,17 +220,17 @@ def prepare_img():
|
||||
@require_vision
|
||||
class ViTMSNModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return ViTFeatureExtractor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return ViTImageProcessor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
torch.manual_seed(2)
|
||||
model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-small").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_torch_available():
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class YolosModelTester:
|
||||
@@ -345,16 +345,16 @@ def prepare_img():
|
||||
@require_vision
|
||||
class YolosModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_feature_extractor(self):
|
||||
return AutoFeatureExtractor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None
|
||||
def default_image_processor(self):
|
||||
return AutoImageProcessor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None
|
||||
|
||||
@slow
|
||||
def test_inference_object_detection_head(self):
|
||||
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small").to(torch_device)
|
||||
|
||||
feature_extractor = self.default_feature_extractor
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
@@ -375,7 +375,7 @@ class YolosModelIntegrationTest(unittest.TestCase):
|
||||
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
|
||||
|
||||
# verify postprocessing
|
||||
results = feature_extractor.post_process_object_detection(
|
||||
results = image_processor.post_process_object_detection(
|
||||
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
|
||||
)[0]
|
||||
expected_scores = torch.tensor([0.9994, 0.9790, 0.9964, 0.9972, 0.9861]).to(torch_device)
|
||||
|
||||
Reference in New Issue
Block a user