Update examples with image processors (#21155)

* Update examples to use image processors

* Small fixes

* Resolve conflicts
This commit is contained in:
amyeroberts
2023-01-19 15:14:58 +00:00
committed by GitHub
parent fc8a93507c
commit 4bc18e7a83
12 changed files with 124 additions and 137 deletions

View File

@@ -52,15 +52,15 @@ ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_
### Create a model from a vision encoder model and a text encoder model
Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder).
The `VisionTextDualEncoderModel` class lets you load any vision and text encoder model to create a dual encoder.
The `VisionTextDualEncoderModel` class lets you load any vision and text encoder model to create a dual encoder.
Here is an example of how to load the model using pre-trained vision and text models.
```python3
from transformers import (
VisionTextDualEncoderModel,
VisionTextDualEncoderProcessor,
AutoTokenizer,
AutoFeatureExtractor
VisionTextDualEncoderModel,
VisionTextDualEncoderProcessor,
AutoTokenizer,
AutoImageProcessor
)
model = VisionTextDualEncoderModel.from_vision_text_pretrained(
@@ -68,8 +68,8 @@ model = VisionTextDualEncoderModel.from_vision_text_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
feat_ext = AutoFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
processor = VisionTextDualEncoderProcessor(feat_ext, tokenizer)
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
# save the model and processor
model.save_pretrained("clip-roberta")

View File

@@ -38,7 +38,7 @@ from torchvision.transforms.functional import InterpolationMode
import transformers
from transformers import (
AutoFeatureExtractor,
AutoImageProcessor,
AutoModel,
AutoTokenizer,
HfArgumentParser,
@@ -74,7 +74,7 @@ class ModelArguments:
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
cache_dir: Optional[str] = field(
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
)
@@ -308,7 +308,7 @@ def main():
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# 5. Load pretrained model, tokenizer, and feature extractor
# 5. Load pretrained model, tokenizer, and image processor
if model_args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
@@ -323,9 +323,9 @@ def main():
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)
# Load feature_extractor, in this script we only use this to get the mean and std for normalization.
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name or model_args.model_name_or_path,
# Load image_processor, in this script we only use this to get the mean and std for normalization.
image_processor = AutoImageProcessor.from_pretrained(
model_args.image_processor_name or model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
@@ -386,7 +386,7 @@ def main():
# 7. Preprocessing the datasets.
# Initialize torchvision transforms and jit it for faster processing.
image_transformations = Transform(
config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
)
image_transformations = torch.jit.script(image_transformations)

View File

@@ -38,7 +38,7 @@ import transformers
from transformers import (
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
AutoConfig,
AutoFeatureExtractor,
AutoImageProcessor,
AutoModelForImageClassification,
HfArgumentParser,
Trainer,
@@ -141,7 +141,7 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
metadata={
@@ -283,19 +283,19 @@ def main():
use_auth_token=True if model_args.use_auth_token else None,
ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name or model_args.model_name_or_path,
image_processor = AutoImageProcessor.from_pretrained(
model_args.image_processor_name or model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# Define torchvision transforms to be applied to each image.
if "shortest_edge" in feature_extractor.size:
size = feature_extractor.size["shortest_edge"]
if "shortest_edge" in image_processor.size:
size = image_processor.size["shortest_edge"]
else:
size = (feature_extractor.size["height"], feature_extractor.size["width"])
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
size = (image_processor.size["height"], image_processor.size["width"])
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
_train_transforms = Compose(
[
RandomResizedCrop(size),
@@ -352,7 +352,7 @@ def main():
train_dataset=dataset["train"] if training_args.do_train else None,
eval_dataset=dataset["validation"] if training_args.do_eval else None,
compute_metrics=compute_metrics,
tokenizer=feature_extractor,
tokenizer=image_processor,
data_collator=collate_fn,
)

View File

@@ -41,13 +41,7 @@ from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from huggingface_hub import Repository, create_repo
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForImageClassification,
SchedulerType,
get_scheduler,
)
from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification, SchedulerType, get_scheduler
from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
from transformers.utils.versions import require_version
@@ -294,7 +288,7 @@ def main():
label2id = {label: str(i) for i, label in enumerate(labels)}
id2label = {str(i): label for i, label in enumerate(labels)}
# Load pretrained model and feature extractor
# Load pretrained model and image processor
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
@@ -305,7 +299,7 @@ def main():
label2id=label2id,
finetuning_task="image-classification",
)
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_name_or_path)
image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path)
model = AutoModelForImageClassification.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
@@ -316,11 +310,11 @@ def main():
# Preprocessing the datasets
# Define torchvision transforms to be applied to each image.
if "shortest_edge" in feature_extractor.size:
size = feature_extractor.size["shortest_edge"]
if "shortest_edge" in image_processor.size:
size = image_processor.size["shortest_edge"]
else:
size = (feature_extractor.size["height"], feature_extractor.size["width"])
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
size = (image_processor.size["height"], image_processor.size["width"])
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
train_transforms = Compose(
[
RandomResizedCrop(size),
@@ -505,7 +499,7 @@ def main():
save_function=accelerator.save,
)
if accelerator.is_main_process:
feature_extractor.save_pretrained(args.output_dir)
image_processor.save_pretrained(args.output_dir)
repo.push_to_hub(
commit_message=f"Training in progress {completed_steps} steps",
blocking=False,
@@ -547,7 +541,7 @@ def main():
args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
)
if accelerator.is_main_process:
feature_extractor.save_pretrained(args.output_dir)
image_processor.save_pretrained(args.output_dir)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)
@@ -568,7 +562,7 @@ def main():
args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
)
if accelerator.is_main_process:
feature_extractor.save_pretrained(args.output_dir)
image_processor.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)

View File

@@ -29,7 +29,7 @@ from transformers import (
HfArgumentParser,
Trainer,
TrainingArguments,
ViTFeatureExtractor,
ViTImageProcessor,
ViTMAEConfig,
ViTMAEForPreTraining,
)
@@ -102,7 +102,7 @@ class DataTrainingArguments:
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/feature extractor we are going to pre-train.
Arguments pertaining to which model/config/image processor we are going to pre-train.
"""
model_name_or_path: str = field(
@@ -132,7 +132,7 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
metadata={
@@ -230,7 +230,7 @@ def main():
ds["train"] = split["train"]
ds["validation"] = split["test"]
# Load pretrained model and feature extractor
# Load pretrained model and image processor
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
@@ -260,13 +260,13 @@ def main():
}
)
# create feature extractor
if model_args.feature_extractor_name:
feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs)
# create image processor
if model_args.image_processor_name:
image_processor = ViTImageProcessor.from_pretrained(model_args.image_processor_name, **config_kwargs)
elif model_args.model_name_or_path:
feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
image_processor = ViTImageProcessor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
feature_extractor = ViTFeatureExtractor()
image_processor = ViTImageProcessor()
# create model
if model_args.model_name_or_path:
@@ -298,17 +298,17 @@ def main():
# transformations as done in original MAE paper
# source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
if "shortest_edge" in feature_extractor.size:
size = feature_extractor.size["shortest_edge"]
if "shortest_edge" in image_processor.size:
size = image_processor.size["shortest_edge"]
else:
size = (feature_extractor.size["height"], feature_extractor.size["width"])
size = (image_processor.size["height"], image_processor.size["width"])
transforms = Compose(
[
Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
RandomResizedCrop(size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC),
RandomHorizontalFlip(),
ToTensor(),
Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
]
)
@@ -349,7 +349,7 @@ def main():
args=training_args,
train_dataset=ds["train"] if training_args.do_train else None,
eval_dataset=ds["validation"] if training_args.do_eval else None,
tokenizer=feature_extractor,
tokenizer=image_processor,
data_collator=collate_fn,
)

View File

@@ -27,10 +27,10 @@ from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalF
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
IMAGE_PROCESSOR_MAPPING,
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
AutoConfig,
AutoFeatureExtractor,
AutoImageProcessor,
AutoModelForMaskedImageModeling,
HfArgumentParser,
Trainer,
@@ -115,7 +115,7 @@ class DataTrainingArguments:
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/feature extractor we are going to pre-train.
Arguments pertaining to which model/config/image processor we are going to pre-train.
"""
model_name_or_path: str = field(
@@ -152,7 +152,7 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
metadata={
@@ -334,17 +334,16 @@ def main():
}
)
# create feature extractor
if model_args.feature_extractor_name:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs)
# create image processor
if model_args.image_processor_name:
image_processor = AutoImageProcessor.from_pretrained(model_args.image_processor_name, **config_kwargs)
elif model_args.model_name_or_path:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
image_processor = AutoImageProcessor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
FEATURE_EXTRACTOR_TYPES = {
conf.model_type: feature_extractor_class
for conf, feature_extractor_class in FEATURE_EXTRACTOR_MAPPING.items()
IMAGE_PROCESSOR_TYPES = {
conf.model_type: image_processor_class for conf, image_processor_class in IMAGE_PROCESSOR_MAPPING.items()
}
feature_extractor = FEATURE_EXTRACTOR_TYPES[model_args.model_type]()
image_processor = IMAGE_PROCESSOR_TYPES[model_args.model_type]()
# create model
if model_args.model_name_or_path:
@@ -382,7 +381,7 @@ def main():
RandomResizedCrop(model_args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
RandomHorizontalFlip(),
ToTensor(),
Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
]
)
@@ -427,7 +426,7 @@ def main():
args=training_args,
train_dataset=ds["train"] if training_args.do_train else None,
eval_dataset=ds["validation"] if training_args.do_eval else None,
tokenizer=feature_extractor,
tokenizer=image_processor,
data_collator=collate_fn,
)

View File

@@ -40,7 +40,7 @@ from datasets import Dataset, DatasetDict, Image
# your images can of course have a different extension
# semantic segmentation maps are typically stored in the png format
image_paths_train = ["path/to/image_1.jpg/jpg", "path/to/image_2.jpg/jpg", ..., "path/to/image_n.jpg/jpg"]
image_paths_train = ["path/to/image_1.jpg/jpg", "path/to/image_2.jpg/jpg", ..., "path/to/image_n.jpg/jpg"]
label_paths_train = ["path/to/annotation_1.png", "path/to/annotation_2.png", ..., "path/to/annotation_n.png"]
# same for validation
@@ -52,7 +52,7 @@ def create_dataset(image_paths, label_paths):
"label": sorted(label_paths)})
dataset = dataset.cast_column("image", Image())
dataset = dataset.cast_column("label", Image())
return dataset
# step 1: create Dataset objects
@@ -91,7 +91,7 @@ You can easily upload this by clicking on "Add file" in the "Files and versions"
## PyTorch version, Trainer
Based on the script [`run_semantic_segmentation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py).
Based on the script [`run_semantic_segmentation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py).
The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transformers/main_classes/trainer) to automatically take care of the training for you, running on distributed environments right away.
@@ -130,7 +130,7 @@ Note that you can replace the model and dataset by simply setting the `model_nam
Based on the script [`run_semantic_segmentation_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py).
The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows you to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.
The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows you to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.
First, run:
@@ -161,11 +161,11 @@ The resulting model can be seen here: https://huggingface.co/nielsr/segformer-fi
This means that after training, you can easily load your trained model as follows:
```python
from transformers import AutoFeatureExtractor, AutoModelForSemanticSegmentation
from transformers import AutoImageProcessor, AutoModelForSemanticSegmentation
model_name = "name_of_repo_on_the_hub_or_path_to_local_folder"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
image_processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForSemanticSegmentation.from_pretrained(model_name)
```
@@ -180,7 +180,7 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
# prepare image for the model
inputs = feature_extractor(images=image, return_tensors="pt")
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
@@ -201,4 +201,4 @@ For visualization of the segmentation maps, we refer to the [example notebook](h
Some datasets, like [`scene_parse_150`](https://huggingface.co/datasets/scene_parse_150), contain a "background" label that is not part of the classes. The Scene Parse 150 dataset for instance contains labels between 0 and 150, with 0 being the background class, and 1 to 150 being actual class names (like "tree", "person", etc.). For these kinds of datasets, one replaces the background label (0) by 255, which is the `ignore_index` of the PyTorch model's loss function, and reduces all labels by 1. This way, the `labels` are PyTorch tensors containing values between 0 and 149, and 255 for all background/padding.
In case you're training on such a dataset, make sure to set the ``reduce_labels`` flag, which will take care of this.
In case you're training on such a dataset, make sure to set the ``reduce_labels`` flag, which will take care of this.

View File

@@ -34,7 +34,7 @@ import transformers
from huggingface_hub import hf_hub_download
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoImageProcessor,
AutoModelForSemanticSegmentation,
HfArgumentParser,
Trainer,
@@ -240,7 +240,7 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
metadata={
@@ -358,7 +358,7 @@ def main():
references=labels,
num_labels=len(id2label),
ignore_index=0,
reduce_labels=feature_extractor.do_reduce_labels,
reduce_labels=image_processor.do_reduce_labels,
)
# add per category metrics as individual key-value pairs
per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
@@ -385,8 +385,8 @@ def main():
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name or model_args.model_name_or_path,
image_processor = AutoImageProcessor.from_pretrained(
model_args.image_processor_name or model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
@@ -395,11 +395,11 @@ def main():
# Define torchvision transforms to be applied to each image + target.
# Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
# Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
if "shortest_edge" in feature_extractor.size:
if "shortest_edge" in image_processor.size:
# We instead set the target size to (shortest_edge, shortest_edge) here to ensure all images are batchable.
size = (feature_extractor.size["shortest_edge"], feature_extractor.size["shortest_edge"])
size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
else:
size = (feature_extractor.size["height"], feature_extractor.size["width"])
size = (image_processor.size["height"], image_processor.size["width"])
train_transforms = Compose(
[
ReduceLabels() if data_args.reduce_labels else Identity(),
@@ -407,7 +407,7 @@ def main():
RandomHorizontalFlip(flip_prob=0.5),
PILToTensor(),
ConvertImageDtype(torch.float),
Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
]
)
# Define torchvision transform to be applied to each image.
@@ -418,7 +418,7 @@ def main():
Resize(size=size),
PILToTensor(),
ConvertImageDtype(torch.float),
Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
]
)
@@ -477,7 +477,7 @@ def main():
train_dataset=dataset["train"] if training_args.do_train else None,
eval_dataset=dataset["validation"] if training_args.do_eval else None,
compute_metrics=compute_metrics,
tokenizer=feature_extractor,
tokenizer=image_processor,
data_collator=default_data_collator,
)

View File

@@ -39,7 +39,7 @@ from accelerate.utils import set_seed
from huggingface_hub import Repository, create_repo, hf_hub_download
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoImageProcessor,
AutoModelForSemanticSegmentation,
SchedulerType,
default_data_collator,
@@ -397,20 +397,20 @@ def main():
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
# Load pretrained model and feature extractor
# Load pretrained model and image processor
config = AutoConfig.from_pretrained(args.model_name_or_path, id2label=id2label, label2id=label2id)
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_name_or_path)
image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path)
model = AutoModelForSemanticSegmentation.from_pretrained(args.model_name_or_path, config=config)
# Preprocessing the datasets
# Define torchvision transforms to be applied to each image + target.
# Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
# Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
if "shortest_edge" in feature_extractor.size:
if "shortest_edge" in image_processor.size:
# We instead set the target size to (shortest_edge, shortest_edge) here to ensure all images are batchable.
size = (feature_extractor.size["shortest_edge"], feature_extractor.size["shortest_edge"])
size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
else:
size = (feature_extractor.size["height"], feature_extractor.size["width"])
size = (image_processor.size["height"], image_processor.size["width"])
train_transforms = Compose(
[
ReduceLabels() if args.reduce_labels else Identity(),
@@ -418,7 +418,7 @@ def main():
RandomHorizontalFlip(flip_prob=0.5),
PILToTensor(),
ConvertImageDtype(torch.float),
Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
]
)
# Define torchvision transform to be applied to each image.
@@ -429,7 +429,7 @@ def main():
Resize(size=size),
PILToTensor(),
ConvertImageDtype(torch.float),
Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
]
)
@@ -602,7 +602,7 @@ def main():
save_function=accelerator.save,
)
if accelerator.is_main_process:
feature_extractor.save_pretrained(args.output_dir)
image_processor.save_pretrained(args.output_dir)
repo.push_to_hub(
commit_message=f"Training in progress {completed_steps} steps",
blocking=False,
@@ -657,7 +657,7 @@ def main():
args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
)
if accelerator.is_main_process:
feature_extractor.save_pretrained(args.output_dir)
image_processor.save_pretrained(args.output_dir)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)
@@ -678,7 +678,7 @@ def main():
args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
)
if accelerator.is_main_process:
feature_extractor.save_pretrained(args.output_dir)
image_processor.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)