BLIPs clean-up (#35560)
* blips clean up * update processor * readability * fix processor length * fix copies * tmp * update and fix copies * why keep these, delete? * fix test fetcher * irrelevant comment * fix tests * fix tests * fix copies
This commit is contained in:
committed by
GitHub
parent
4f8f51be4e
commit
75794792ad
@@ -29,6 +29,7 @@ from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
require_torch_fp16,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_accelerator,
|
||||
require_torch_sdpa,
|
||||
require_vision,
|
||||
@@ -777,7 +778,14 @@ class Blip2TextModelTester:
|
||||
# this model tester uses an encoder-decoder language model (T5)
|
||||
class Blip2ModelTester:
|
||||
def __init__(
|
||||
self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
|
||||
self,
|
||||
parent,
|
||||
vision_kwargs=None,
|
||||
qformer_kwargs=None,
|
||||
text_kwargs=None,
|
||||
is_training=True,
|
||||
num_query_tokens=10,
|
||||
image_token_index=4,
|
||||
):
|
||||
if vision_kwargs is None:
|
||||
vision_kwargs = {}
|
||||
@@ -792,11 +800,10 @@ class Blip2ModelTester:
|
||||
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
|
||||
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
|
||||
self.encoder_seq_length = (
|
||||
self.text_model_tester.encoder_seq_length + num_query_tokens
|
||||
) # need enc seq_length for gen tests
|
||||
self.encoder_seq_length = self.text_model_tester.encoder_seq_length
|
||||
self.is_training = is_training
|
||||
self.num_query_tokens = num_query_tokens
|
||||
self.image_token_index = image_token_index
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
_, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
|
||||
@@ -819,6 +826,7 @@ class Blip2ModelTester:
|
||||
qformer_config=self.qformer_model_tester.get_config(),
|
||||
text_config=self.text_model_tester.get_config(),
|
||||
num_query_tokens=self.num_query_tokens,
|
||||
image_token_index=self.image_token_index,
|
||||
)
|
||||
|
||||
def create_and_check_for_conditional_generation(
|
||||
@@ -1872,37 +1880,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
|
||||
self.assertEqual(generated_text, expected_ids_and_text[1])
|
||||
|
||||
def test_expansion_in_processing(self):
|
||||
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
|
||||
model = Blip2ForConditionalGeneration.from_pretrained(
|
||||
"Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
|
||||
).to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
prompt = "Question: which city is this? Answer:"
|
||||
|
||||
# Make sure we will go the legacy path by setting these args to None
|
||||
processor.num_query_tokens = None
|
||||
model.config.image_token_index = None
|
||||
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
||||
|
||||
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
|
||||
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
|
||||
|
||||
# Add args to the config to trigger new logic when inputs are expanded in processing file
|
||||
processor.num_query_tokens = model.config.num_query_tokens
|
||||
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
|
||||
model.config.image_token_index = len(processor.tokenizer) - 1
|
||||
model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64)
|
||||
|
||||
# Generate again with new inputs
|
||||
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
||||
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
|
||||
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
|
||||
|
||||
self.assertTrue(generated_text_expanded == generated_text)
|
||||
|
||||
@require_torch_accelerator
|
||||
@require_torch_gpu
|
||||
def test_inference_itm(self):
|
||||
model_name = "Salesforce/blip2-itm-vit-g"
|
||||
processor = Blip2Processor.from_pretrained(model_name)
|
||||
|
||||
@@ -48,6 +48,9 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {"num_query_tokens": 1}
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
@@ -84,26 +87,12 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
for key in input_feat_extract.keys():
|
||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
def test_tokenizer(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
|
||||
encoded_processor = processor(text=input_str)
|
||||
|
||||
encoded_tok = tokenizer(input_str, return_token_type_ids=False)
|
||||
|
||||
for key in encoded_tok.keys():
|
||||
self.assertListEqual(encoded_tok[key], encoded_processor[key][0])
|
||||
|
||||
def test_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -119,8 +108,9 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_tokenizer_decode(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
|
||||
|
||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||
|
||||
@@ -132,8 +122,9 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_model_input_names(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
@@ -809,34 +809,3 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
|
||||
predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1]
|
||||
)
|
||||
self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.")
|
||||
|
||||
def test_expansion_in_processing(self):
|
||||
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl")
|
||||
model = InstructBlipForConditionalGeneration.from_pretrained(
|
||||
"Salesforce/instructblip-flan-t5-xl",
|
||||
torch_dtype=torch.bfloat16,
|
||||
).to(torch_device)
|
||||
|
||||
image = prepare_img()
|
||||
prompt = "What's in the image?"
|
||||
|
||||
# Make sure we will go the legacy path by setting these args to None
|
||||
processor.num_query_tokens = None
|
||||
model.config.image_token_index = None
|
||||
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
||||
|
||||
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
|
||||
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
|
||||
|
||||
# Add args to the config to trigger new logic when inputs are expanded in processing file
|
||||
processor.num_query_tokens = model.config.num_query_tokens
|
||||
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
|
||||
model.config.image_token_index = len(processor.tokenizer) - 2
|
||||
model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64)
|
||||
|
||||
# Generate again with new inputs
|
||||
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
||||
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
|
||||
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
|
||||
|
||||
self.assertTrue(generated_text_expanded == generated_text)
|
||||
|
||||
@@ -59,6 +59,9 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_qformer_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {"num_query_tokens": 1}
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
@@ -90,9 +93,13 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
image_processor=image_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -103,35 +110,17 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
for key in input_feat_extract.keys():
|
||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
def test_tokenizer(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
|
||||
processor = InstructBlipProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
)
|
||||
|
||||
input_str = ["lower newer"]
|
||||
|
||||
encoded_processor = processor(text=input_str)
|
||||
|
||||
encoded_tokens = tokenizer(input_str, return_token_type_ids=False)
|
||||
encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False)
|
||||
|
||||
for key in encoded_tokens.keys():
|
||||
self.assertListEqual(encoded_tokens[key], encoded_processor[key])
|
||||
|
||||
for key in encoded_tokens_qformer.keys():
|
||||
self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])
|
||||
|
||||
def test_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
image_processor=image_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -141,7 +130,7 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assertListEqual(
|
||||
list(inputs.keys()),
|
||||
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
|
||||
["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"],
|
||||
)
|
||||
|
||||
# test if it raises when no input is passed
|
||||
@@ -152,9 +141,13 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
image_processor=image_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||
@@ -168,9 +161,13 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
image_processor=image_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -180,5 +177,5 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assertListEqual(
|
||||
list(inputs.keys()),
|
||||
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
|
||||
["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"],
|
||||
)
|
||||
|
||||
@@ -750,34 +750,3 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
|
||||
generated_text,
|
||||
"Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p",
|
||||
)
|
||||
|
||||
def test_expansion_in_processing(self):
|
||||
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
|
||||
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
|
||||
"Salesforce/instructblip-vicuna-7b",
|
||||
load_in_8bit=True,
|
||||
)
|
||||
|
||||
clip = prepare_video()
|
||||
prompt = "Explain what is happening in this short video."
|
||||
|
||||
# Make sure we will go the legacy path by setting these args to None
|
||||
processor.num_query_tokens = None
|
||||
model.config.video_token_index = None
|
||||
inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
||||
|
||||
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
|
||||
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
|
||||
|
||||
# Add args to the config to trigger new logic when inputs are expanded in processing file
|
||||
processor.num_query_tokens = model.config.num_query_tokens
|
||||
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<video>"]})
|
||||
model.config.video_token_index = len(processor.tokenizer) - 1
|
||||
model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)
|
||||
|
||||
# Generate again with new inputs
|
||||
inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
|
||||
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
|
||||
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
|
||||
|
||||
self.assertTrue(generated_text_expanded == generated_text)
|
||||
|
||||
@@ -59,6 +59,9 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_qformer_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {"num_query_tokens": 1}
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
@@ -93,9 +96,13 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -110,15 +117,17 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
input_str = ["lower newer"]
|
||||
|
||||
encoded_processor = processor(text=input_str)
|
||||
|
||||
encoded_tokens = tokenizer(input_str, return_token_type_ids=False)
|
||||
encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False)
|
||||
|
||||
@@ -132,9 +141,13 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -144,7 +157,7 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assertListEqual(
|
||||
list(inputs.keys()),
|
||||
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
|
||||
["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"],
|
||||
)
|
||||
|
||||
# test if it raises when no input is passed
|
||||
@@ -155,9 +168,13 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||
@@ -171,9 +188,13 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
qformer_tokenizer=qformer_tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -183,5 +204,5 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assertListEqual(
|
||||
list(inputs.keys()),
|
||||
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
|
||||
["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user