diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index d4f3f2373b..66c7c95e6b 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1539,16 +1539,25 @@ class Blip2Model(Blip2PreTrainedModel): # step 3: use the language model, conditioned on the query outputs and the prompt language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds], dim=1) if attention_mask is None: attention_mask = torch.ones_like(input_ids) - expected_device = language_model_attention_mask.device - attention_mask = torch.cat([language_model_attention_mask, attention_mask.to(expected_device)], dim=1) + + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(language_model_inputs.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.to(language_model_inputs.device).masked_scatter( + special_image_mask, language_model_inputs + ) if self.config.use_decoder_only_language_model: outputs = self.language_model( @@ -2026,9 +2035,6 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): ) vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs - language_model_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -2036,34 +2042,19 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "image_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "image_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.image_token_id - - special_image_mask = ( - special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(language_model_inputs.device) - ) - language_model_inputs = language_model_inputs.to(inputs_embeds.dtype) - inputs_embeds = inputs_embeds.to(language_model_inputs.device).masked_scatter( - special_image_mask, language_model_inputs + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for image tokens in BLIP-2 should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.image_token_id + + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(language_model_inputs.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.to(language_model_inputs.device).masked_scatter( + special_image_mask, language_model_inputs + ) if self.config.use_decoder_only_language_model: outputs = self.language_model( @@ -2172,15 +2163,11 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): query_output = query_output.to(image_embeds.dtype) language_model_inputs = self.language_projection(query_output) - language_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) if inputs_embeds is None: if input_ids is None: - start_tokens = [self.config.text_config.bos_token_id] - if getattr(self.config, "image_token_id", None) is not None: - start_tokens = [self.config.image_token_id] * self.config.num_query_tokens + start_tokens + image_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens = image_tokens + [self.config.text_config.bos_token_id] input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) input_ids = input_ids.repeat(batch_size, 1) inputs_embeds = self.get_input_embeddings()(input_ids) @@ -2188,53 +2175,24 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "image_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "image_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.image_token_id - - special_image_mask = ( - special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(language_model_inputs.device) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) ) - language_model_inputs = language_model_inputs.to(inputs_embeds.dtype) - inputs_embeds = inputs_embeds.to(language_model_inputs.device).masked_scatter( - special_image_mask, language_model_inputs - ) - - attention_mask = attention_mask.to(language_attention_mask.device) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for image tokens in BLIP-2 should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.image_token_id - # add image_embeds length to max_length, so that the final max_length in counted only on token embeds - # -1 is to account for the prepended BOS after `generate.` - # TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs - if not self.language_model.config.is_encoder_decoder: - generate_kwargs["max_length"] = ( - generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 - ) - generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - if input_ids is not None: - input_ids = input_ids.to(language_model_inputs.device) inputs["input_ids"] = input_ids outputs = self.language_model.generate(**inputs, **generate_kwargs) + return outputs @@ -2362,8 +2320,13 @@ class Blip2ForImageTextRetrieval(Blip2PreTrainedModel): if use_image_text_matching_head: query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) - query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=query_tokens.device) - attention_mask = torch.cat([query_attention_mask, attention_mask], dim=1) + if self.config.image_token_index is not None: + input_ids = input_ids[:, self.config.num_query_tokens :] + else: + query_attention_mask = torch.ones( + query_tokens.size()[:-1], dtype=torch.long, device=query_tokens.device + ) + attention_mask = torch.cat([query_attention_mask, attention_mask], dim=1) query_embeds = self.embeddings( input_ids=input_ids, @@ -2395,6 +2358,10 @@ class Blip2ForImageTextRetrieval(Blip2PreTrainedModel): image_embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state image_embeds = image_embeds.to(dtype=self.vision_projection.weight.dtype) + if self.config.image_token_index is not None: + input_ids = input_ids[:, self.config.num_query_tokens :] + attention_mask = attention_mask[:, self.config.num_query_tokens :] + query_embeds = self.embeddings( input_ids=input_ids, ) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 810311d492..71a1f5ef4d 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -112,11 +112,13 @@ class Blip2Processor(ProcessorMixin): tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + # BC for explicit return_tensors - if "return_tensors" in output_kwargs["common_kwargs"]: - return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) - else: - return_tensors = None + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + max_length = output_kwargs["text_kwargs"].pop("max_length", None) + if max_length is not None: + output_kwargs["text_kwargs"]["max_length"] = max_length - self.num_query_tokens + encoding = BatchFeature(tensor_type=return_tensors) if text is not None: if isinstance(text, str): @@ -124,40 +126,28 @@ class Blip2Processor(ProcessorMixin): elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") - text_encoding = {} + # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token + text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) - _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) - output_kwargs["text_kwargs"]["return_tensors"] = return_tensors - - # if we know how many query tokens, expand text inside processor. We need this hacky manipulation - # because BLIP expects image tokens to be at the beginning even before BOS token - if self.num_query_tokens is not None: + if images is not None and self.num_query_tokens is not None: + # Image tokens should not be padded/truncated or prepended with special BOS token image_tokens = self.image_token.content * self.num_query_tokens - image_token_encoding = self.tokenizer( - [image_tokens] * len(text), add_special_tokens=False, return_tensors=None - ) - for k in _text_encoding: - text_encoding[k] = [ - img_encoding + txt_encoding - for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k]) - ] - else: - text_encoding = _text_encoding - logger.warning_once( - "Expanding inputs for image tokens in BLIP-2 should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." - ) + output_kwargs["text_kwargs"]["add_special_tokens"] = False + output_kwargs["text_kwargs"]["padding"] = False + output_kwargs["text_kwargs"]["truncation"] = False + image_text_encoding = self.tokenizer(image_tokens, **output_kwargs["text_kwargs"]) + for k in text_encoding: + text_encoding[k] = [image_text_encoding[k] + sample for sample in text_encoding[k]] + encoding.update(text_encoding) - # cast to desired return tensors type - encoding.update(BatchEncoding(text_encoding, tensor_type=return_tensors)) - # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # Now add pixel_values encoding. If we also have text_encoding, update image encoding and return it. # else, return the text encoding. - if images is not None: image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(image_encoding) + + # Cast to desired return tensors type + encoding = BatchFeature(encoding, tensor_type=return_tensors) return encoding # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 55d7d32f0f..72482558d7 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -799,7 +799,7 @@ class InstructBlipQFormerLayer(GradientCheckpointingLayer): self.chunk_size_feed_forward, self.seq_len_dim, attention_output[:, query_length:, :], - ) + ).to(layer_output.device) layer_output = torch.cat([layer_output, layer_output_text], dim=1) else: layer_output = apply_chunking_to_forward( @@ -1560,9 +1560,6 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati ) vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs - language_model_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1570,30 +1567,17 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "image_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "image_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.image_token_id - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for image tokens in InstructBLIP should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.image_token_id + + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) if self.config.use_decoder_only_language_model: outputs = self.language_model( @@ -1682,54 +1666,29 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True, ) - language_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) if inputs_embeds is None: if input_ids is None: - start_tokens = [self.config.text_config.bos_token_id] - if getattr(self.config, "image_token_id", None) is not None: - start_tokens = [self.config.image_token_id] * self.config.num_query_tokens + start_tokens - input_ids = torch.tensor([start_tokens], dtype=torch.long, device=language_model_inputs.device) + image_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens = image_tokens + [self.config.text_config.bos_token_id] + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device) input_ids = input_ids.repeat(batch_size, 1) inputs_embeds = self.get_input_embeddings()(input_ids) if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "image_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "image_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.image_token_id - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for image tokens in InstructBLIP should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.image_token_id - # add image_embeds length to max_length, so that the final max_length in counted only on token embeds - # -1 is to account for the prepended BOS after `generate.` - if not self.language_model.config.is_encoder_decoder: - generate_kwargs["max_length"] = ( - generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 - ) - generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 1b2026f6dd..30a06da209 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -22,7 +22,7 @@ from typing import Union from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack -from ...tokenization_utils_base import AddedToken, BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import logging from ..auto import AutoTokenizer @@ -78,6 +78,7 @@ class InstructBlipProcessor(ProcessorMixin): else: self.image_token = tokenizer.image_token self.num_query_tokens = num_query_tokens + super().__init__(image_processor, tokenizer, qformer_tokenizer) def __call__( @@ -111,52 +112,40 @@ class InstructBlipProcessor(ProcessorMixin): **kwargs, ) - encoding = BatchFeature() - + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + encoding = {} if text is not None: if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") - # we have to concatenate lists - so we keep track of return_tensors here - return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) - _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) - output_kwargs["text_kwargs"]["return_tensors"] = return_tensors - # if we know how many query tokens, expand text inside processor. We need this hacky manipulation - # because BLIP expects image tokens to be at the beginning even before BOS token - if self.num_query_tokens is not None and images is not None: - text_encoding = {} - image_tokens = self.image_token.content * self.num_query_tokens - image_token_encoding = self.tokenizer( - [image_tokens] * len(text), add_special_tokens=False, return_tensors=None - ) - for k in _text_encoding: - text_encoding[k] = [ - img_encoding + txt_encoding - for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k]) - ] - else: - text_encoding = _text_encoding - if images is not None: - logger.warning_once( - "Expanding inputs for image tokens in InstructBLIP should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." - ) - - # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors) - - encoding.update(text_encoding) qformer_text_encoding = self.qformer_tokenizer(text, **output_kwargs["text_kwargs"]) encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") + # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token + if output_kwargs["text_kwargs"].get("max_length") is not None: + output_kwargs["text_kwargs"]["max_length"] -= self.num_query_tokens + text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + if images is not None: + # Image tokens should not be padded/truncated or prepended with special BOS token + image_tokens = self.image_token.content * self.num_query_tokens + output_kwargs["text_kwargs"]["add_special_tokens"] = False + output_kwargs["text_kwargs"]["padding"] = False + output_kwargs["text_kwargs"]["truncation"] = False + image_text_encoding = self.tokenizer(image_tokens, **output_kwargs["text_kwargs"]) + for k in text_encoding: + text_encoding[k] = [image_text_encoding[k] + sample for sample in text_encoding[k]] + encoding.update(text_encoding) + if images is not None: image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(image_encoding) + # Cast to desired return tensors type + encoding = BatchFeature(encoding, tensor_type=return_tensors) return encoding # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 6f5b0f8f04..5d3aeec452 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -660,7 +660,7 @@ class InstructBlipVideoQFormerLayer(GradientCheckpointingLayer): self.chunk_size_feed_forward, self.seq_len_dim, attention_output[:, query_length:, :], - ) + ).to(layer_output.device) layer_output = torch.cat([layer_output, layer_output_text], dim=1) else: layer_output = apply_chunking_to_forward( @@ -1527,9 +1527,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel ) vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs - language_model_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1537,30 +1534,17 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "video_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "video_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.video_token_id - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.54." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.video_token_id + + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) if self.config.use_decoder_only_language_model: outputs = self.language_model( @@ -1650,54 +1634,28 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel return_dict=True, ) - language_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) - if inputs_embeds is None: if input_ids is None: - start_tokens = [self.config.text_config.bos_token_id] - if getattr(self.config, "video_token_id", None) is not None: - start_tokens = [self.config.video_token_id] * self.config.num_query_tokens * 4 + start_tokens - input_ids = torch.tensor([start_tokens], dtype=torch.long, device=language_model_inputs.device) + video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens = video_tokens + [self.config.text_config.bos_token_id] + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device) input_ids = input_ids.repeat(batch_size, 1) inputs_embeds = self.get_input_embeddings()(input_ids) if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "video_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "video_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.video_token_id - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.54." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.video_token_id - # add image_embeds length to max_length, so that the final max_length in counted only on token embeds - # -1 is to account for the prepended BOS after `generate.` - if not self.language_model.config.is_encoder_decoder: - generate_kwargs["max_length"] = ( - generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 - ) - generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index e6f3289608..5c4404a6c4 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -464,9 +464,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera ) vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs - language_model_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -474,30 +471,17 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "video_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "video_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.video_token_id - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.54." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.video_token_id + + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) if self.config.use_decoder_only_language_model: outputs = self.language_model( @@ -587,54 +571,28 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera return_dict=True, ) - language_attention_mask = torch.ones( - language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device - ) - if inputs_embeds is None: if input_ids is None: - start_tokens = [self.config.text_config.bos_token_id] - if getattr(self.config, "video_token_id", None) is not None: - start_tokens = [self.config.video_token_id] * self.config.num_query_tokens * 4 + start_tokens - input_ids = torch.tensor([start_tokens], dtype=torch.long, device=language_model_inputs.device) + video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens = video_tokens + [self.config.text_config.bos_token_id] + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device) input_ids = input_ids.repeat(batch_size, 1) inputs_embeds = self.get_input_embeddings()(input_ids) if attention_mask is None: attention_mask = torch.ones_like(input_ids) - # if the model already has "video_token_id" then the input is expanded to account for image embeds - # otherwise we expand manually by concatenating - if getattr(self.config, "video_token_id", None) is not None: - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) - ) - special_image_mask = special_image_mask.all(-1) - else: - special_image_mask = input_ids == self.config.video_token_id - - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) - language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) else: - logger.warning_once( - "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.54." - ) - inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) - attention_mask = torch.cat( - [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1 - ) + special_image_mask = input_ids == self.config.video_token_id - # add image_embeds length to max_length, so that the final max_length in counted only on token embeds - # -1 is to account for the prepended BOS after `generate.` - if not self.language_model.config.is_encoder_decoder: - generate_kwargs["max_length"] = ( - generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 - ) - generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index e2174f248a..7ce638805b 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -23,7 +23,6 @@ from ...image_processing_utils import BatchFeature from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import ( AddedToken, - BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, @@ -99,60 +98,13 @@ class InstructBlipVideoProcessor(ProcessorMixin): if images is None and text is None: raise ValueError("You have to specify at least one of images or text.") - encoding = BatchFeature() - + encoding = {} if text is not None: if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") - _text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=None, # required to concatenate below - **kwargs, - ) - - # if we know how many query tokens, expand text inside processor. We need this hacky manipulation - # because BLIP expects image tokens to be at the beginning even before BOS token - if self.num_query_tokens is not None and images is not None: - text_encoding = {} - video_tokens = ( - self.video_token.content * self.num_query_tokens * 4 - ) # InstrucBLIP works with 4 frames only - video_token_encoding = self.tokenizer( - [video_tokens] * len(text), add_special_tokens=False, return_tensors=None - ) - for k in _text_encoding: - text_encoding[k] = [ - img_encoding + txt_encoding - for img_encoding, txt_encoding in zip(video_token_encoding[k], _text_encoding[k]) - ] - else: - text_encoding = _text_encoding - if images is not None: - logger.warning_once( - "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " - "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.54." - ) - - # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors) - encoding.update(text_encoding) qformer_text_encoding = self.qformer_tokenizer( text=text, add_special_tokens=add_special_tokens, @@ -174,10 +126,51 @@ class InstructBlipVideoProcessor(ProcessorMixin): encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") + # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token + # InstrucBLIP works with 4 frames only + if max_length is not None: + max_length -= self.num_query_tokens + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=None, # required to concatenate below + **kwargs, + ) + + if images is not None: + video_tokens = self.video_token.content * self.num_query_tokens * 4 + video_text_encoding = self.tokenizer( + video_tokens, + add_special_tokens=False, # required to concatenate below + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + return_tensors=None, + ) + for k in text_encoding: + text_encoding[k] = [video_text_encoding[k] + sample for sample in text_encoding[k]] + encoding.update(text_encoding) + if images is not None: image_encoding = self.video_processor(images, return_tensors=return_tensors) encoding.update(image_encoding) + encoding = BatchFeature(encoding, tensor_type=return_tensors) return encoding # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index f923273c02..d5606ac705 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -29,6 +29,7 @@ from transformers.testing_utils import ( require_torch, require_torch_accelerator, require_torch_fp16, + require_torch_gpu, require_torch_multi_accelerator, require_torch_sdpa, require_vision, @@ -777,7 +778,14 @@ class Blip2TextModelTester: # this model tester uses an encoder-decoder language model (T5) class Blip2ModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -792,11 +800,10 @@ class Blip2ModelTester: self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests - self.encoder_seq_length = ( - self.text_model_tester.encoder_seq_length + num_query_tokens - ) # need enc seq_length for gen tests + self.encoder_seq_length = self.text_model_tester.encoder_seq_length self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() @@ -819,6 +826,7 @@ class Blip2ModelTester: qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation( @@ -1872,37 +1880,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase): self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0]) self.assertEqual(generated_text, expected_ids_and_text[1]) - def test_expansion_in_processing(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16 - ).to(torch_device) - - image = prepare_img() - prompt = "Question: which city is this? Answer:" - - # Make sure we will go the legacy path by setting these args to None - processor.num_query_tokens = None - model.config.image_token_index = None - inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) - - predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Add args to the config to trigger new logic when inputs are expanded in processing file - processor.num_query_tokens = model.config.num_query_tokens - processor.tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - model.config.image_token_index = len(processor.tokenizer) - 1 - model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64) - - # Generate again with new inputs - inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) - predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip() - - self.assertTrue(generated_text_expanded == generated_text) - - @require_torch_accelerator + @require_torch_gpu def test_inference_itm(self): model_name = "Salesforce/blip2-itm-vit-g" processor = Blip2Processor.from_pretrained(model_name) diff --git a/tests/models/blip_2/test_processor_blip_2.py b/tests/models/blip_2/test_processor_blip_2.py index 5d125dc57e..d2b63f8e40 100644 --- a/tests/models/blip_2/test_processor_blip_2.py +++ b/tests/models/blip_2/test_processor_blip_2.py @@ -48,6 +48,9 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def get_image_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + def prepare_processor_dict(self): + return {"num_query_tokens": 1} + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -84,26 +87,12 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): for key in input_feat_extract.keys(): self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key][0]) - def test_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() + processor_kwargs = self.prepare_processor_dict() - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor) + processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -119,8 +108,9 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() + processor_kwargs = self.prepare_processor_dict() - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor) + processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] @@ -132,8 +122,9 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def test_model_input_names(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() + processor_kwargs = self.prepare_processor_dict() - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor) + processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs) input_str = "lower newer" image_input = self.prepare_image_inputs() diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 341e570172..fe309e700c 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -809,34 +809,3 @@ class InstructBlipModelIntegrationTest(unittest.TestCase): predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1] ) self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.") - - def test_expansion_in_processing(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") - model = InstructBlipForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-flan-t5-xl", - torch_dtype=torch.bfloat16, - ).to(torch_device) - - image = prepare_img() - prompt = "What's in the image?" - - # Make sure we will go the legacy path by setting these args to None - processor.num_query_tokens = None - model.config.image_token_index = None - inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) - - predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Add args to the config to trigger new logic when inputs are expanded in processing file - processor.num_query_tokens = model.config.num_query_tokens - processor.tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - model.config.image_token_index = len(processor.tokenizer) - 2 - model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64) - - # Generate again with new inputs - inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) - predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip() - - self.assertTrue(generated_text_expanded == generated_text) diff --git a/tests/models/instructblip/test_processor_instructblip.py b/tests/models/instructblip/test_processor_instructblip.py index 6675390e0b..984826aaa2 100644 --- a/tests/models/instructblip/test_processor_instructblip.py +++ b/tests/models/instructblip/test_processor_instructblip.py @@ -59,6 +59,9 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): def get_qformer_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer + def prepare_processor_dict(self): + return {"num_query_tokens": 1} + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -90,9 +93,13 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() qformer_tokenizer = self.get_qformer_tokenizer() + processor_kwargs = self.prepare_processor_dict() processor = InstructBlipProcessor( - tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + tokenizer=tokenizer, + image_processor=image_processor, + qformer_tokenizer=qformer_tokenizer, + **processor_kwargs, ) image_input = self.prepare_image_inputs() @@ -103,35 +110,17 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): for key in input_feat_extract.keys(): self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - - processor = InstructBlipProcessor( - tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer - ) - - input_str = ["lower newer"] - - encoded_processor = processor(text=input_str) - - encoded_tokens = tokenizer(input_str, return_token_type_ids=False) - encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tokens.keys(): - self.assertListEqual(encoded_tokens[key], encoded_processor[key]) - - for key in encoded_tokens_qformer.keys(): - self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key]) - def test_processor(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() qformer_tokenizer = self.get_qformer_tokenizer() + processor_kwargs = self.prepare_processor_dict() processor = InstructBlipProcessor( - tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + tokenizer=tokenizer, + image_processor=image_processor, + qformer_tokenizer=qformer_tokenizer, + **processor_kwargs, ) input_str = "lower newer" @@ -141,7 +130,7 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertListEqual( list(inputs.keys()), - ["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"], + ["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"], ) # test if it raises when no input is passed @@ -152,9 +141,13 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() qformer_tokenizer = self.get_qformer_tokenizer() + processor_kwargs = self.prepare_processor_dict() processor = InstructBlipProcessor( - tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + tokenizer=tokenizer, + image_processor=image_processor, + qformer_tokenizer=qformer_tokenizer, + **processor_kwargs, ) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] @@ -168,9 +161,13 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() qformer_tokenizer = self.get_qformer_tokenizer() + processor_kwargs = self.prepare_processor_dict() processor = InstructBlipProcessor( - tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + tokenizer=tokenizer, + image_processor=image_processor, + qformer_tokenizer=qformer_tokenizer, + **processor_kwargs, ) input_str = "lower newer" @@ -180,5 +177,5 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertListEqual( list(inputs.keys()), - ["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"], + ["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"], ) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 6b5f4b6614..5f056d4464 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -750,34 +750,3 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase): generated_text, "Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p", ) - - def test_expansion_in_processing(self): - processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") - model = InstructBlipVideoForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-vicuna-7b", - load_in_8bit=True, - ) - - clip = prepare_video() - prompt = "Explain what is happening in this short video." - - # Make sure we will go the legacy path by setting these args to None - processor.num_query_tokens = None - model.config.video_token_index = None - inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) - - predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Add args to the config to trigger new logic when inputs are expanded in processing file - processor.num_query_tokens = model.config.num_query_tokens - processor.tokenizer.add_special_tokens({"additional_special_tokens": ["