[LlavaNext] Fix llava next unsafe imports (#29773)
* patch llava-next * styling * styling
This commit is contained in:
@@ -748,6 +748,44 @@ def get_size_dict(
|
||||
return size_dict
|
||||
|
||||
|
||||
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    For every candidate, the original image is scaled (keeping its aspect ratio) so that it fits inside the
    candidate. The *effective* resolution is the pixel count of that scaled image, capped at the pixel count of the
    original image; the *wasted* resolution is the candidate's area minus the effective resolution. The best fit
    maximizes the effective resolution and, among equals, minimizes the wasted resolution. If several candidates are
    still tied, the first one in `possible_resolutions` wins.

    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best fit resolution in the format (height, width).

    Raises:
        ValueError: If `original_size` has a non-positive dimension, or if no resolution could be selected (for
            example because `possible_resolutions` is empty).
    """
    original_height, original_width = original_size
    # Fail with a clear message instead of a bare ZeroDivisionError when computing the scale below.
    if original_height <= 0 or original_width <= 0:
        raise ValueError(f"`original_size` must contain positive dimensions, got {original_size}.")

    original_area = original_width * original_height
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for height, width in possible_resolutions:
        # Largest aspect-preserving scale at which the image still fits inside the candidate.
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        # Upscaling adds no information, so the effective resolution never exceeds the original area.
        effective_resolution = min(downscaled_width * downscaled_height, original_area)
        wasted_resolution = (width * height) - effective_resolution

        is_sharper = effective_resolution > max_effective_resolution
        is_tighter = effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
        if is_sharper or is_tighter:
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (height, width)

    # Previously this fell through and returned None, which violates the documented `tuple` return type and only
    # surfaced later as an opaque unpacking error in the caller.
    if best_fit is None:
        raise ValueError("No resolution could be selected: `possible_resolutions` is empty or contains no valid entry.")

    return best_fit
|
||||
|
||||
|
||||
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
|
||||
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
|
||||
ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
|
||||
|
||||
@@ -77,7 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("layoutlmv3", "LayoutLMv3ImageProcessor"),
|
||||
("levit", "LevitImageProcessor"),
|
||||
("llava", "CLIPImageProcessor"),
|
||||
("llava_next", "CLIPImageProcessor"),
|
||||
("llava_next", "LlavaNextImageProcessor"),
|
||||
("mask2former", "Mask2FormerImageProcessor"),
|
||||
("maskformer", "MaskFormerImageProcessor"),
|
||||
("mgp-str", "ViTImageProcessor"),
|
||||
|
||||
@@ -19,7 +19,7 @@ from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
|
||||
from ...image_transforms import (
|
||||
convert_to_rgb,
|
||||
get_resize_output_image_size,
|
||||
@@ -51,44 +51,6 @@ if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    For every candidate, the original image is scaled (keeping its aspect ratio) so that it fits inside the
    candidate. The *effective* resolution is the pixel count of that scaled image, capped at the pixel count of the
    original image; the *wasted* resolution is the candidate's area minus the effective resolution. The best fit
    maximizes the effective resolution and, among equals, minimizes the wasted resolution. If several candidates are
    still tied, the first one in `possible_resolutions` wins.

    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best fit resolution in the format (height, width).

    Raises:
        ValueError: If `original_size` has a non-positive dimension, or if no resolution could be selected (for
            example because `possible_resolutions` is empty).
    """
    original_height, original_width = original_size
    # Fail with a clear message instead of a bare ZeroDivisionError when computing the scale below.
    if original_height <= 0 or original_width <= 0:
        raise ValueError(f"`original_size` must contain positive dimensions, got {original_size}.")

    original_area = original_width * original_height
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for height, width in possible_resolutions:
        # Largest aspect-preserving scale at which the image still fits inside the candidate.
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        # Upscaling adds no information, so the effective resolution never exceeds the original area.
        effective_resolution = min(downscaled_width * downscaled_height, original_area)
        wasted_resolution = (width * height) - effective_resolution

        is_sharper = effective_resolution > max_effective_resolution
        is_tighter = effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
        if is_sharper or is_tighter:
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (height, width)

    # Previously this fell through and returned None, which violates the documented `tuple` return type and only
    # surfaced later as an opaque unpacking error in the caller.
    if best_fit is None:
        raise ValueError("No resolution could be selected: `possible_resolutions` is empty or contains no valid entry.")

    return best_fit
|
||||
|
||||
|
||||
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
|
||||
"""
|
||||
Divides an image into patches of a specified size.
|
||||
|
||||
@@ -24,6 +24,7 @@ from torch import nn
|
||||
from ... import PreTrainedModel
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache
|
||||
from ...image_processing_utils import select_best_resolution
|
||||
from ...modeling_outputs import ModelOutput
|
||||
from ...utils import (
|
||||
add_start_docstrings,
|
||||
@@ -33,7 +34,6 @@ from ...utils import (
|
||||
)
|
||||
from ..auto import AutoModel, AutoModelForCausalLM
|
||||
from .configuration_llava_next import LlavaNextConfig
|
||||
from .image_processing_llava_next import select_best_resolution
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
Reference in New Issue
Block a user