[llava] one pixel is missing from padding when length is odd (#37819)
* [fix] one pixel should be added when length is odd
* [fix] add vision_aspect_ratio args & typo
* [fix] style
* [fix] do not fix fast file directly
* [fix] convert using modular
* remove duplicate code
* match unpad logic with pad logic
* test odd-sized images for llava & aria
* test unpad odd-sized padding for llava family
* fix style
* add kwarg to onevision modular
* move vision_aspect_ratio from image_processor to processor (llava_onevision)
This commit is contained in:
@@ -18,12 +18,11 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from typing import Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
|
||||
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
|
||||
from ...image_utils import (
|
||||
ChannelDimension,
|
||||
@@ -71,23 +70,6 @@ def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> Li
|
||||
return patches
|
||||
|
||||
|
||||
def _get_patch_output_size(image, target_resolution, input_data_format):
|
||||
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
|
||||
target_height, target_width = target_resolution
|
||||
|
||||
scale_w = target_width / original_width
|
||||
scale_h = target_height / original_height
|
||||
|
||||
if scale_w < scale_h:
|
||||
new_width = target_width
|
||||
new_height = min(math.ceil(original_height * scale_w), target_height)
|
||||
else:
|
||||
new_height = target_height
|
||||
new_width = min(math.ceil(original_width * scale_h), target_width)
|
||||
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
class AriaImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
A vision processor for the Aria model that handles image preprocessing.
|
||||
@@ -375,7 +357,7 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Returns:
|
||||
np.array: The resized and padded image.
|
||||
"""
|
||||
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
|
||||
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
|
||||
|
||||
# Resize the image
|
||||
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
|
||||
@@ -389,12 +371,12 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Pad an image to a target resolution while maintaining aspect ratio.
|
||||
"""
|
||||
target_height, target_width = target_resolution
|
||||
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
|
||||
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
|
||||
|
||||
paste_x = (target_width - new_width) // 2
|
||||
paste_y = (target_height - new_height) // 2
|
||||
paste_x, r_x = divmod(target_width - new_width, 2)
|
||||
paste_y, r_y = divmod(target_height - new_height, 2)
|
||||
|
||||
padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
|
||||
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))
|
||||
|
||||
return padded_image
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -20,7 +19,7 @@ import numpy as np
|
||||
from ...activations import ACT2FN
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...generation import GenerationMixin
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
|
||||
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
|
||||
from ...image_utils import (
|
||||
ChannelDimension,
|
||||
@@ -461,23 +460,6 @@ class AriaProjector(nn.Module):
|
||||
return out
|
||||
|
||||
|
||||
def _get_patch_output_size(image, target_resolution, input_data_format):
|
||||
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
|
||||
target_height, target_width = target_resolution
|
||||
|
||||
scale_w = target_width / original_width
|
||||
scale_h = target_height / original_height
|
||||
|
||||
if scale_w < scale_h:
|
||||
new_width = target_width
|
||||
new_height = min(math.ceil(original_height * scale_w), target_height)
|
||||
else:
|
||||
new_height = target_height
|
||||
new_width = min(math.ceil(original_width * scale_h), target_width)
|
||||
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
class AriaImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
A vision processor for the Aria model that handles image preprocessing.
|
||||
@@ -765,7 +747,7 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Returns:
|
||||
np.array: The resized and padded image.
|
||||
"""
|
||||
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
|
||||
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
|
||||
|
||||
# Resize the image
|
||||
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
|
||||
@@ -779,12 +761,12 @@ class AriaImageProcessor(BaseImageProcessor):
|
||||
Pad an image to a target resolution while maintaining aspect ratio.
|
||||
"""
|
||||
target_height, target_width = target_resolution
|
||||
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
|
||||
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
|
||||
|
||||
paste_x = (target_width - new_width) // 2
|
||||
paste_y = (target_height - new_height) // 2
|
||||
paste_x, r_x = divmod(target_width - new_width, 2)
|
||||
paste_y, r_y = divmod(target_height - new_height, 2)
|
||||
|
||||
padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
|
||||
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))
|
||||
|
||||
return padded_image
|
||||
|
||||
|
||||
Reference in New Issue
Block a user