Llava Onevision: add model (#32673)
* working version * fix copies * update * tests * update docs * codestyle * add more tests * add returns for docs * clean up * Update src/transformers/models/llava_onevision/processing_llava_onevision.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * updates * codestyle * style * shouldn't be reversed * [run-slow] llava_onevision * [run-slow] llava_onevision * add pooling in videos * [run-slow] llava_onevision * num-logits-to-keep * [run-slow] llava_onevision * [run-slow] llava_onevision * Update tests/test_modeling_common.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * video matched orig impl * fix tests * chat template was modified * Update docs/source/en/model_doc/llava_onevision.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add more info in the doc page --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
9230d78e76
commit
43df47d8e7
@@ -491,6 +491,7 @@ class GenerationTesterMixin:
|
||||
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1)
|
||||
else:
|
||||
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
|
||||
|
||||
self._check_outputs(output_generate, input_ids, model.config, use_cache=True)
|
||||
|
||||
@pytest.mark.generate
|
||||
@@ -630,6 +631,7 @@ class GenerationTesterMixin:
|
||||
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1)
|
||||
else:
|
||||
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
|
||||
|
||||
self._check_outputs(
|
||||
output_generate, input_ids, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"]
|
||||
)
|
||||
@@ -986,6 +988,7 @@ class GenerationTesterMixin:
|
||||
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1)
|
||||
else:
|
||||
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
|
||||
|
||||
self._check_outputs(output_generate, input_ids, model.config, use_cache=True)
|
||||
|
||||
@pytest.mark.generate
|
||||
@@ -1152,6 +1155,7 @@ class GenerationTesterMixin:
|
||||
output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
|
||||
|
||||
# The two outputs must match and their shape must be as expected
|
||||
|
||||
self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist())
|
||||
for output in (output_greedy, output_assisted):
|
||||
self._check_outputs(output, input_ids, model.config, use_cache=True)
|
||||
@@ -1216,6 +1220,7 @@ class GenerationTesterMixin:
|
||||
output_prompt_lookup = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
|
||||
|
||||
# The two outputs must match and their shape must be as expected
|
||||
|
||||
self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist())
|
||||
for output in (output_greedy, output_prompt_lookup):
|
||||
self._check_outputs(output, input_ids, model.config, use_cache=True)
|
||||
@@ -1453,8 +1458,10 @@ class GenerationTesterMixin:
|
||||
next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :]
|
||||
|
||||
# With left-padding (length 32)
|
||||
# can hardcode pad_token to be 0 as we'll do attn masking anyway
|
||||
pad_token_id = config.pad_token_id if getattr(config, "pad_token_id") is not None else 0
|
||||
pad_size = (input_ids.shape[0], 32)
|
||||
padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id
|
||||
padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
|
||||
padded_input_ids = torch.cat((padding, input_ids), dim=1)
|
||||
padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
|
||||
model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature)
|
||||
@@ -1765,15 +1772,14 @@ class GenerationTesterMixin:
|
||||
}
|
||||
|
||||
max_cache_len = seq_length + max_new_tokens
|
||||
config = config.text_config if hasattr(config, "text_config") else config
|
||||
head_dim = (
|
||||
model.config.head_dim
|
||||
if hasattr(model.config, "head_dim")
|
||||
else model.config.hidden_size // model.config.num_attention_heads
|
||||
config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
|
||||
)
|
||||
num_key_value_heads = (
|
||||
model.config.num_attention_heads
|
||||
config.num_attention_heads
|
||||
if getattr(config, "num_key_value_heads", None) is None
|
||||
else model.config.num_key_value_heads
|
||||
else config.num_key_value_heads
|
||||
)
|
||||
num_hidden_layers = config.num_hidden_layers
|
||||
results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
|
||||
@@ -1922,6 +1928,7 @@ class GenerationTesterMixin:
|
||||
|
||||
def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
|
||||
batch_size, seq_length = input_ids.shape
|
||||
config = config.text_config if hasattr(config, "text_config") else config
|
||||
num_sequences_in_output = batch_size * num_return_sequences
|
||||
|
||||
gen_len = (
|
||||
|
||||
Reference in New Issue
Block a user