[LlavaNext] Fix llava next unsafe imports (#29773)

* path llava-next

* styling

* styling
This commit is contained in:
Arthur
2024-03-21 21:47:58 +09:00
committed by GitHub
parent 2ddceef9a2
commit 73a73b415e
4 changed files with 41 additions and 41 deletions

View File

@@ -748,6 +748,44 @@ def get_size_dict(
return size_dict
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
"""
Selects the best resolution from a list of possible resolutions based on the original size.
This is done by calculating the effective and wasted resolution for each possible resolution.
The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
Args:
original_size (tuple):
The original size of the image in the format (height, width).
possible_resolutions (list):
A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
Returns:
tuple: The best fit resolution in the format (height, width).
"""
original_height, original_width = original_size
best_fit = None
max_effective_resolution = 0
min_wasted_resolution = float("inf")
for height, width in possible_resolutions:
scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
wasted_resolution = (width * height) - effective_resolution
if effective_resolution > max_effective_resolution or (
effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
):
max_effective_resolution = effective_resolution
min_wasted_resolution = wasted_resolution
best_fit = (height, width)
return best_fit
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(

View File

@@ -77,7 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("layoutlmv3", "LayoutLMv3ImageProcessor"),
("levit", "LevitImageProcessor"),
("llava", "CLIPImageProcessor"),
("llava_next", "CLIPImageProcessor"),
("llava_next", "LlavaNextImageProcessor"),
("mask2former", "Mask2FormerImageProcessor"),
("maskformer", "MaskFormerImageProcessor"),
("mgp-str", "ViTImageProcessor"),

View File

@@ -19,7 +19,7 @@ from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
from ...image_transforms import (
convert_to_rgb,
get_resize_output_image_size,
@@ -51,44 +51,6 @@ if is_vision_available():
from PIL import Image
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
"""
Selects the best resolution from a list of possible resolutions based on the original size.
This is done by calculating the effective and wasted resolution for each possible resolution.
The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
Args:
original_size (tuple):
The original size of the image in the format (height, width).
possible_resolutions (list):
A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
Returns:
tuple: The best fit resolution in the format (height, width).
"""
original_height, original_width = original_size
best_fit = None
max_effective_resolution = 0
min_wasted_resolution = float("inf")
for height, width in possible_resolutions:
scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
wasted_resolution = (width * height) - effective_resolution
if effective_resolution > max_effective_resolution or (
effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
):
max_effective_resolution = effective_resolution
min_wasted_resolution = wasted_resolution
best_fit = (height, width)
return best_fit
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
"""
Divides an image into patches of a specified size.

View File

@@ -24,6 +24,7 @@ from torch import nn
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...utils import (
add_start_docstrings,
@@ -33,7 +34,6 @@ from ...utils import (
)
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_llava_next import LlavaNextConfig
from .image_processing_llava_next import select_best_resolution
logger = logging.get_logger(__name__)