Add fuyu model (#26911)

* initial commit * add processor, add fuyu naming * add draft processor * fix processor * remove dropout to fix loading of weights * add image processing fixes from Pedro * fix * fix processor * add basic processing fuyu test * add documentation and TODO * address comments, add tests, add doc * replace assert with torch asserts * add Mixins and fix tests * clean imports * add model tester, clean imports * fix embedding test * add updated tests from pre-release model * Processor: return input_ids used for inference * separate processing and model tests * relax test tolerance for embeddings * add test for logit comparison * make sure fuyu image processor is imported in the init * fix formattingh * more formatting issues * and more * fixups * remove some stuff * nits * update init * remove the fuyu file * Update integration test with release model * Update conversion script. The projection is not used, as confirmed by the authors. * improve geenration * Remove duplicate function * Trickle down patches to model call * processing fuyu updates * remove things * fix prepare_inputs_for_generation to fix generate() * remove model_input * update * add generation tests * nits * draft leverage automodel and autoconfig * nits * fix dtype patch * address comments, update READMEs and doc, include tests * add working processing test, remove refs to subsequences * add tests, remove Sequence classification * processing * update * update the conversion script * more processing cleanup * safe import * take out ModelTesterMixin for early release * more cl;eanup * more cleanup * more cleanup * and more * register a buffer * nits * add postprocessing of generate output * nits * updates * add one working test * fix test * make fixup works * fixup * Arthur's updates * nits * update * update * fix processor * update tests * passe more fixups * fix * nits * don't import torch * skip fuyu config for now * fixup done * fixup * update * oups * nits * Use input embeddings * no buffer * update * styling processing fuyu * fix test * update licence * protect torch import * fixup and update not doctested * kwargs should be passed * udpates * update the impofixuprts in the test * protect import * protecting imports * protect imports in type checking * add testing decorators * protect top level import structure * fix typo * fix check init * move requires_backend to functions * Imports * Protect types --------- Co-authored-by: Pedro Cuenca <pedro@huggingface.co> Co-authored-by: ArthurZucker <arthur.zucker@gmail.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Lysandre <lysandre@huggingface.co>
2023-10-19 00:24:11 +02:00
parent 5a73316bed
commit caa0ff0bf1
33 changed files with 2277 additions and 1 deletions
--- a/tests/models/fuyu/init.py
+++ b/tests/models/fuyu/init.py
--- a/tests/models/fuyu/test_image_processing_fuyu.py
+++ b/tests/models/fuyu/test_image_processing_fuyu.py
@@ -0,0 +1,65 @@
+import unittest
+
+import numpy as np
+
+from transformers import is_torch_available, is_vision_available
+from transformers.testing_utils import (
+    require_torch,
+    require_torchvision,
+    require_vision,
+)
+
+
+if is_torch_available() and is_vision_available():
+    import torch
+
+    from transformers import FuyuImageProcessor
+
+if is_vision_available():
+    from PIL import Image
+
+
+@require_torch
+@require_vision
+@require_torchvision
+class TestFuyuImageProcessor(unittest.TestCase):
+    def setUp(self):
+        self.processor = FuyuImageProcessor(target_height=160, target_width=320, padding_value=1.0)
+        self.batch_size = 3
+        self.channels = 3
+        self.height = 300
+        self.width = 300
+
+        self.image_input = torch.rand(self.batch_size, self.channels, self.height, self.width)
+
+        self.image_patch_dim_h = 30
+        self.image_patch_dim_w = 30
+        self.sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
+        self.sample_image_pil = Image.fromarray(self.sample_image)
+
+    def test_patches(self):
+        expected_num_patches = self.processor.get_num_patches(
+            img_h=self.height, img_w=self.width, patch_dim_h=self.image_patch_dim_h, patch_dim_w=self.image_patch_dim_w
+        )
+
+        patches_final = self.processor.patchify_image(
+            image=self.image_input, patch_dim_h=self.image_patch_dim_h, patch_dim_w=self.image_patch_dim_w
+        )
+        assert (
+            patches_final.shape[1] == expected_num_patches
+        ), f"Expected {expected_num_patches} patches, got {patches_final.shape[1]}."
+
+    def test_scale_to_target_aspect_ratio(self):
+        scaled_image = self.processor._scale_to_target_aspect_ratio(self.sample_image)
+        self.assertEqual(scaled_image.shape[0], 74)
+        self.assertEqual(scaled_image.shape[1], 160)
+
+    def test_apply_transformation_numpy(self):
+        transformed_image = self.processor.apply_transformation(self.sample_image)
+        self.assertEqual(transformed_image.shape[0], 160)
+        self.assertEqual(transformed_image.shape[1], 320)
+
+    def test_apply_transformation_pil(self):
+        transformed_image = self.processor.apply_transformation(self.sample_image_pil)
+        self.assertEqual(transformed_image.shape[0], 160)
+        self.assertEqual(transformed_image.shape[1], 320)
--- a/tests/models/fuyu/test_modeling_fuyu.py
+++ b/tests/models/fuyu/test_modeling_fuyu.py
@@ -0,0 +1,362 @@
+import io
+import unittest
+
+import requests
+
+from transformers import AutoTokenizer, FuyuConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
+
+from ...test_modeling_common import ids_tensor, random_attention_mask
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+if is_torch_available() and is_vision_available():
+    from transformers import FuyuImageProcessor, FuyuProcessor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import FuyuForCausalLM
+
+
+# Copied from transformers.tests.llama.test_modelling_llama.LlamaModelTest with Llama->Fuyu
+class FuyuModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        image_size=300,
+        patch_size=30,
+        num_channels=3,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=False,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        pad_token_id=0,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.pad_token_id = pad_token_id
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def get_config(self):
+        return FuyuConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+            pad_token_id=self.pad_token_id,
+        )
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = FuyuForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_model_as_decoder(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.add_cross_attention = True
+        model = FuyuForCausalLM(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+        )
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+        )
+        result = model(input_ids, attention_mask=input_mask)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_for_causal_lm(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        model = FuyuForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_decoder_model_past_large_inputs(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.is_decoder = True
+        config.add_cross_attention = True
+        model = FuyuForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        outputs = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=True,
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+        output_from_past = model(
+            next_tokens,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+@require_torch_gpu
+@slow
+class FuyuIntegrationTest(unittest.TestCase):  # , ModelTesterMixin)
+    """
+    Currently, all these tests depend on a value of max_tokens_to_generate of 10.
+    """
+
+    all_model_classes = ("FuyuForCausalLM") if is_torch_available() else ()
+
+    def setUp(self):
+        self.pretrained_model_name = "huggingface/new_model_release_weights"
+        tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name)
+        image_processor = FuyuImageProcessor()
+
+        self.processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        self.model = FuyuForCausalLM.from_pretrained(self.pretrained_model_name)
+        self.bus_image_url = (
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
+        )
+        self.bus_image_pil = Image.open(io.BytesIO(requests.get(self.bus_image_url).content))
+
+    @slow
+    @require_torch_gpu
+    def test_model_8b_chat_greedy_generation_bus_captioning(self):
+        EXPECTED_TEXT_COMPLETION = """A bus parked on the side of a road.|ENDOFTEXT|"""
+        text_prompt_coco_captioning = "Generate a coco-style caption.\n"
+
+        model_inputs_bus_captioning = self.processor(text=text_prompt_coco_captioning, images=self.bus_image_pil)
+        generated_tokens = self.model.generate(**model_inputs_bus_captioning, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence[1:])
+
+
+"""
+    @slow
+    @require_torch_gpu
+    def test_model_8b_chat_greedy_generation_bus_color(self):
+        EXPECTED_TEXT_COMPLETION = "The bus is blue.\n|ENDOFTEXT|"
+        text_prompt_bus_color = "What color is the bus?\n"
+        model_inputs_bus_color = self.processor(text=text_prompt_bus_color, images=self.bus_image_pil)
+
+        generated_tokens = self.model.generate(**model_inputs_bus_color, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
+
+    @slow
+    @require_torch_gpu
+    def test_model_8b_chat_greedy_generation_chart_vqa(self):
+        # fmt: off
+        EXPECTED_TEXT_TOKENS = ["The","life expectancy","at","birth","of male","s in","","20","18","is","","80",".","7",".","\n","|ENDOFTEXT|",]
+        # fmt: on
+        expected_text_completion = " ".join(EXPECTED_TEXT_TOKENS)  # TODO make sure the end string matches
+
+        text_prompt_chart_vqa = "What is the highest life expectancy at birth of male?\n"
+
+        chart_image_url = (
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/chart.png"
+        )
+        chart_image_pil = Image.open(io.BytesIO(requests.get(chart_image_url).content))
+
+        model_inputs_chart_vqa = self.processor(text=text_prompt_chart_vqa, images=chart_image_pil)
+        generated_tokens = self.model.generate(**model_inputs_chart_vqa, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(expected_text_completion, clean_sequence)
+
+    @slow
+    @require_torch_gpu
+    def test_model_8b_chat_greedy_generation_bounding_box(self):
+        EXPECTED_TEXT_COMPLETION = "\x00194213202244\x01|ENDOFTEXT|"
+        text_prompt_bbox = "When presented with a box, perform OCR to extract text contained within it. If provided with text, generate the corresponding bounding box.\\nWilliams"  # noqa: E231
+
+        bbox_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bbox_sample_image.png"
+        bbox_image_pil = Image.open(io.BytesIO(requests.get(bbox_image_url).content))
+
+        model_inputs_bbox = self.processor(text=text_prompt_bbox, images=bbox_image_pil)
+        generated_tokens = self.model.generate(**model_inputs_bbox, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
+"""
--- a/tests/models/fuyu/test_processing_fuyu.py
+++ b/tests/models/fuyu/test_processing_fuyu.py
@@ -0,0 +1,126 @@
+import io
+import unittest
+
+import requests
+
+from transformers import AutoTokenizer, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_torch_gpu, slow
+
+
+if is_vision_available():
+    from PIL import Image
+
+if is_vision_available() and is_torch_available():
+    from transformers import FuyuImageProcessor, FuyuProcessor
+
+if is_torch_available():
+    import torch
+
+    from transformers.models.fuyu.processing_fuyu import construct_full_unpacked_stream, full_unpacked_stream_to_tensor
+
+
+@require_torch
+@require_torch_gpu
+@slow
+class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here?
+    """ """
+
+    def setUp(self):
+        pretrained_model_name = "huggingface/pre_release_model"
+        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
+        image_processor = FuyuImageProcessor()
+
+        processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        text_prompt = "Generate a coco-style caption.\\n"
+        bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
+        bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
+
+        self.one_image_bus_model_inputs = processor(text=text_prompt, images=bus_image_pil)
+
+    def test_fuyu_processing(self):
+        """
+        Test to ensure that the standard processing on a gold example matches adept's code.
+        """
+        # fmt: off
+        EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
+        EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
+        # fmt: on
+        torch.testing.assert_close(
+            self.one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS
+        )
+        torch.testing.assert_close(self.one_image_bus_model_inputs["input_ids"], EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS)
+
+
+@require_torch
+class TestImageTextProcessingUtils(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 2
+        self.new_seq_len = 8
+        self.num_sub_sequences = 1
+
+        self.all_bi_tokens_to_place = [4, 6]
+        self.full_unpacked_stream = [torch.tensor([1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9, 10])]
+        self.fill_value = 0
+
+        self.num_real_text_tokens = [[3, 2], [2, 4]]
+        # Here the input stream is padded to avoid inconsistencies (current model release matches)
+        self.input_stream = torch.tensor([[[1, 2, 3], [4, 5, 0]], [[6, 7, 0], [8, 9, 10]]])
+        self.image_tokens = [
+            [torch.tensor([1, 2]), torch.tensor([3])],
+            [torch.tensor([4, 5, 6]), torch.tensor([7, 8])],
+        ]
+
+    def test_full_unpacked_stream_to_tensor(self):
+        result = full_unpacked_stream_to_tensor(
+            self.all_bi_tokens_to_place,
+            self.full_unpacked_stream,
+            self.fill_value,
+            self.batch_size,
+            self.new_seq_len,
+            offset=0,
+        )
+        EXPECTED_TENSOR = torch.tensor([[1, 2, 3, 4, 0, 0, 0, 0], [5, 6, 7, 8, 9, 10, 0, 0]])
+        self.assertTrue(torch.equal(result, EXPECTED_TENSOR))
+
+    def test_construct_full_unpacked_stream(self):
+        result = construct_full_unpacked_stream(
+            self.num_real_text_tokens, self.input_stream, self.image_tokens, self.batch_size, self.num_sub_sequences
+        )
+        EXPECTED_UNPACKED_STREAM = [torch.tensor([1, 2, 1, 2, 3]), torch.tensor([4, 5, 6, 6, 7])]
+        for i in range(len(result)):
+            self.assertTrue(torch.equal(result[i], EXPECTED_UNPACKED_STREAM[i]))
+
+
+@require_torch
+class TestProcessImagesForModelInput(unittest.TestCase):
+    def setUp(self):
+        """
+        Adding a mix of present and absent images.
+        """
+        self.image_processor = FuyuImageProcessor()
+
+        self.image_input = torch.randn([1, 1, 3, 64, 64])
+        self.image_present = torch.tensor([[1]])
+        self.image_unpadded_h = torch.tensor([[45]])  # Adjusted for subsequence of 1
+        self.image_unpadded_w = torch.tensor([[50]])  # Adjusted for subsequence of 1
+        self.image_patch_dim_h = 16
+        self.image_patch_dim_w = 16
+        self.image_placeholder_id = 999
+        self.image_newline_id = 888
+        self.variable_sized = True
+
+    def test_process_images_for_model_input_fixed_sized(self):
+        self.variable_sized = False
+        result = self.image_processor.process_images_for_model_input(
+            image_input=self.image_input,
+            image_present=self.image_present,
+            image_unpadded_h=self.image_unpadded_h,
+            image_unpadded_w=self.image_unpadded_w,
+            image_patch_dim_h=self.image_patch_dim_h,
+            image_patch_dim_w=self.image_patch_dim_w,
+            image_placeholder_id=self.image_placeholder_id,
+            image_newline_id=self.image_newline_id,
+            variable_sized=self.variable_sized,
+        )
+        print(result["images"][0][0])
+        self.assertEqual(result["images"][0][0].shape, torch.Size([3, 64, 64]))