Update examples with image processors (#21155)
* Update examples to use image processors * Small fixes * Resolve conflicts
This commit is contained in:
@@ -52,15 +52,15 @@ ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_
|
||||
|
||||
### Create a model from a vision encoder model and a text encoder model
|
||||
Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder).
|
||||
The `VisionTextDualEncoderModel` class lets you load any vision and text encoder models to create a dual encoder.
|
||||
The `VisionTextDualEncoderModel` class lets you load any vision and text encoder models to create a dual encoder.
|
||||
Here is an example of how to load the model using pre-trained vision and text models.
|
||||
|
||||
```python3
|
||||
from transformers import (
|
||||
VisionTextDualEncoderModel,
|
||||
VisionTextDualEncoderProcessor,
|
||||
AutoTokenizer,
|
||||
AutoFeatureExtractor
|
||||
VisionTextDualEncoderModel,
|
||||
VisionTextDualEncoderProcessor,
|
||||
AutoTokenizer,
|
||||
AutoImageProcessor
|
||||
)
|
||||
|
||||
model = VisionTextDualEncoderModel.from_vision_text_pretrained(
|
||||
@@ -68,8 +68,8 @@ model = VisionTextDualEncoderModel.from_vision_text_pretrained(
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
|
||||
feat_ext = AutoFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
processor = VisionTextDualEncoderProcessor(feat_ext, tokenizer)
|
||||
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
|
||||
|
||||
# save the model and processor
|
||||
model.save_pretrained("clip-roberta")
|
||||
|
||||
@@ -38,7 +38,7 @@ from torchvision.transforms.functional import InterpolationMode
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoFeatureExtractor,
|
||||
AutoImageProcessor,
|
||||
AutoModel,
|
||||
AutoTokenizer,
|
||||
HfArgumentParser,
|
||||
@@ -74,7 +74,7 @@ class ModelArguments:
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
|
||||
image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
|
||||
)
|
||||
@@ -308,7 +308,7 @@ def main():
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
|
||||
# 5. Load pretrained model, tokenizer, and feature extractor
|
||||
# 5. Load pretrained model, tokenizer, and image processor
|
||||
if model_args.tokenizer_name:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
|
||||
@@ -323,9 +323,9 @@ def main():
|
||||
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
|
||||
)
|
||||
|
||||
# Load feature_extractor, in this script we only use this to get the mean and std for normalization.
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
||||
model_args.feature_extractor_name or model_args.model_name_or_path,
|
||||
# Load image_processor, in this script we only use this to get the mean and std for normalization.
|
||||
image_processor = AutoImageProcessor.from_pretrained(
|
||||
model_args.image_processor_name or model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
@@ -386,7 +386,7 @@ def main():
|
||||
# 7. Preprocessing the datasets.
|
||||
# Initialize torchvision transforms and jit it for faster processing.
|
||||
image_transformations = Transform(
|
||||
config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
|
||||
config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
|
||||
)
|
||||
image_transformations = torch.jit.script(image_transformations)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user