Llava Onevision: add model (#32673)

* working version

* fix copies

* update

* tests

* update docs

* codestyle

* add more tests

* add returns for docs

* clean up

* Update src/transformers/models/llava_onevision/processing_llava_onevision.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* updates

* codestyle

* style

* shouldn't be reversed

* [run-slow] llava_onevision

* [run-slow] llava_onevision

* add pooling in videos

* [run-slow] llava_onevision

* num-logits-to-keep

* [run-slow] llava_onevision

* [run-slow] llava_onevision

* Update tests/test_modeling_common.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* video matched orig impl

* fix tests

* chat template was modified

* Update docs/source/en/model_doc/llava_onevision.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* add more info in the doc page

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-09-05 11:43:20 +02:00
committed by GitHub
parent 9230d78e76
commit 43df47d8e7
29 changed files with 4157 additions and 9 deletions

View File

@@ -491,6 +491,7 @@ class GenerationTesterMixin:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
self._check_outputs(output_generate, input_ids, model.config, use_cache=True)
@pytest.mark.generate
@@ -630,6 +631,7 @@ class GenerationTesterMixin:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
self._check_outputs(
output_generate, input_ids, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"]
)
@@ -986,6 +988,7 @@ class GenerationTesterMixin:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
self._check_outputs(output_generate, input_ids, model.config, use_cache=True)
@pytest.mark.generate
@@ -1152,6 +1155,7 @@ class GenerationTesterMixin:
output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
# The two outputs must match and their shape must be as expected
self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist())
for output in (output_greedy, output_assisted):
self._check_outputs(output, input_ids, model.config, use_cache=True)
@@ -1216,6 +1220,7 @@ class GenerationTesterMixin:
output_prompt_lookup = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
# The two outputs must match and their shape must be as expected
self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist())
for output in (output_greedy, output_prompt_lookup):
self._check_outputs(output, input_ids, model.config, use_cache=True)
@@ -1453,8 +1458,10 @@ class GenerationTesterMixin:
next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :]
# With left-padding (length 32)
# can hardcode pad_token to be 0 as we'll do attn masking anyway
pad_token_id = config.pad_token_id if getattr(config, "pad_token_id") is not None else 0
pad_size = (input_ids.shape[0], 32)
padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id
padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
padded_input_ids = torch.cat((padding, input_ids), dim=1)
padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature)
@@ -1765,15 +1772,14 @@ class GenerationTesterMixin:
}
max_cache_len = seq_length + max_new_tokens
config = config.text_config if hasattr(config, "text_config") else config
head_dim = (
model.config.head_dim
if hasattr(model.config, "head_dim")
else model.config.hidden_size // model.config.num_attention_heads
config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
)
num_key_value_heads = (
model.config.num_attention_heads
config.num_attention_heads
if getattr(config, "num_key_value_heads", None) is None
else model.config.num_key_value_heads
else config.num_key_value_heads
)
num_hidden_layers = config.num_hidden_layers
results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
@@ -1922,6 +1928,7 @@ class GenerationTesterMixin:
def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
batch_size, seq_length = input_ids.shape
config = config.text_config if hasattr(config, "text_config") else config
num_sequences_in_output = batch_size * num_return_sequences
gen_len = (