Update examples with image processors (#21155)

* Update examples to use image processors * Small fixes * Resolve conflicts
2023-01-19 15:14:58 +00:00
parent fc8a93507c
commit 4bc18e7a83
12 changed files with 124 additions and 137 deletions
--- a/examples/pytorch/contrastive-image-text/README.md
+++ b/examples/pytorch/contrastive-image-text/README.md
@@ -52,15 +52,15 @@ ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_

 ### Create a model from a vision encoder model and a text decoder model
 Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder).
-The `VisionTextDualEncoderModel` class let's you load any vision and text encoder model to create a dual encoder. 
+The `VisionTextDualEncoderModel` class lets you load any vision and text encoder model to create a dual encoder.
 Here is an example of how to load the model using pre-trained vision and text models.

 ```python3
 from transformers import (
-    VisionTextDualEncoderModel, 
-    VisionTextDualEncoderProcessor, 
-    AutoTokenizer, 
-    AutoFeatureExtractor
+    VisionTextDualEncoderModel,
+    VisionTextDualEncoderProcessor,
+    AutoTokenizer,
+    AutoImageProcessor
 )

 model = VisionTextDualEncoderModel.from_vision_text_pretrained(
@@ -68,8 +68,8 @@ model = VisionTextDualEncoderModel.from_vision_text_pretrained(
 )

 tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-feat_ext = AutoFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
-processor = VisionTextDualEncoderProcessor(feat_ext, tokenizer)
+image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)

 # save the model and processor
 model.save_pretrained("clip-roberta")
--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@@ -38,7 +38,7 @@ from torchvision.transforms.functional import InterpolationMode

 import transformers
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
    AutoModel,
    AutoTokenizer,
    HfArgumentParser,
@@ -74,7 +74,7 @@ class ModelArguments:
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
@@ -308,7 +308,7 @@ def main():
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

-    # 5. Load pretrained model, tokenizer, and feature extractor
+    # 5. Load pretrained model, tokenizer, and image processor
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
@@ -323,9 +323,9 @@ def main():
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

-    # Load feature_extractor, in this script we only use this to get the mean and std for normalization.
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    # Load image_processor. In this script it is only used to get the mean and std for normalization.
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
@@ -386,7 +386,7 @@ def main():
    # 7. Preprocessing the datasets.
    # Initialize torchvision transforms and jit it for faster processing.
    image_transformations = Transform(
-        config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
+        config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
    )
    image_transformations = torch.jit.script(image_transformations)