Add FA2 and sdpa support for SigLIP (#31499)

* Rebase to main

* Fix attention implementation autoset for text and vision configs

* Fixup

* Minor fixes

* Fix copies

* Fix attention_mask for FA2

* Add equivalence tests for siglip

* Remove right padding test

* Uncomment flaky

* Fix import

* Add to docs

* Fix test message

* Add sdpa

* Add sdpa equivalence test

* Add siglip sdpa to docs

* Fix typing for attention output

* Add sdpa tests

* Fix signature of FA2

* Autoset attn_implementation in config

* Rename bsz -> batch_size

* Move back autoset attn method

* Mark as flaky

* Correct attention mask padding

* [run-slow] siglip

* Add FA2 and sdpa docs

* Style fix

* Remove flaky for FA2 test

* Change attention implementation set

* Change attn_implementation propagation

* Fix typos

* Add modality to assert message

* Add more sdpa backends in test

* [run slow] siglip

* Add math sdpa backend for all options

* [run slow] siglip
This commit is contained in:
Pavel Iakubovskii
2024-07-08 11:10:02 +01:00
committed by GitHub
parent 076e66e479
commit a177821b24
5 changed files with 680 additions and 13 deletions

View File

@@ -18,18 +18,30 @@ import inspect
import os
import tempfile
import unittest
from typing import Tuple
import numpy as np
import requests
from parameterized import parameterized
from pytest import mark
from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from transformers.testing_utils import (
require_flash_attn,
require_torch,
require_torch_gpu,
require_torch_sdpa,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import (
is_torch_available,
is_torch_bf16_available_on_device,
is_torch_fp16_available_on_device,
is_torch_sdpa_available,
is_vision_available,
)
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@@ -37,6 +49,7 @@ from ...test_modeling_common import (
_config_zero_init,
floats_tensor,
ids_tensor,
is_flaky,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
@@ -48,6 +61,8 @@ if is_torch_available():
from transformers import SiglipForImageClassification, SiglipModel, SiglipTextModel, SiglipVisionModel
if is_torch_sdpa_available():
from torch.nn.attention import SDPBackend, sdpa_kernel
if is_vision_available():
from PIL import Image
@@ -55,6 +70,155 @@ if is_vision_available():
from transformers import SiglipProcessor
class SiglipModelTesterMixin(ModelTesterMixin):
    """Common-test mixin shared by the SigLIP test classes in this file.

    It provides an eager-vs-SDPA equivalence check that the concrete test
    classes invoke through ``super()`` with the output keys that are relevant
    for their model.
    """

    def test_eager_matches_sdpa_inference(
        self,
        torch_dtype: str,
        use_attention_mask_options: Tuple[bool, ...] = (True, False),
        logit_keys: Tuple[str, ...] = ("logits_per_image", "logits_per_text", "image_embeds", "text_embeds"),
    ):
        """Check that the eager and SDPA attention implementations agree.

        Every model class is saved once, reloaded once with the default
        attention implementation (asserted to be SDPA) and once with
        ``attn_implementation="eager"``, and then run over a grid of
        (attention mask, ``output_attentions``, SDPA backend, batch size).

        Args:
            torch_dtype: One of ``"float16"``, ``"bfloat16"`` or ``"float32"``.
            use_attention_mask_options: Mask settings to exercise; ``True``
                runs with a left-padded attention mask, ``False`` without one.
            logit_keys: Output entries compared between the two models.
        """
        if not self.all_model_classes[0]._supports_sdpa:
            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")

        if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device):
            self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)")

        if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device):
            self.skipTest(
                f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)"
            )

        # Convert to torch dtype
        dtypes = {
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "float32": torch.float32,
        }
        dtype = dtypes[torch_dtype]

        atols = {
            torch.float32: 1e-5,
            torch.bfloat16: 3e-2,
            torch.float16: 5e-3,
        }
        rtols = {
            torch.float32: 1e-4,
            torch.bfloat16: 3e-2,
            torch.float16: 5e-3,
        }
        atol = atols[dtype]
        rtol = rtols[dtype]

        def get_mean_reldiff(msg, current_case, x, ref, atol, rtol):
            return f"{msg} {current_case}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"

        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                # Load the model with SDPA
                model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=dtype)
                model_sdpa = model_sdpa.eval().to(torch_device)

                # Load model with eager attention
                model_eager = model_class.from_pretrained(
                    tmpdirname,
                    torch_dtype=dtype,
                    attn_implementation="eager",
                )
                model_eager = model_eager.eval().to(torch_device)

            self.assertEqual(model_sdpa.config._attn_implementation, "sdpa")
            self.assertEqual(model_eager.config._attn_implementation, "eager")

            for submodule in model_eager.modules():
                class_name = submodule.__class__.__name__
                if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
                    raise ValueError("The eager model should not have SDPA attention layers")

            has_sdpa = False
            for submodule in model_sdpa.modules():
                class_name = submodule.__class__.__name__
                if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
                    has_sdpa = True
                    break
            if not has_sdpa and model_sdpa.config.model_type != "falcon":
                raise ValueError("The SDPA model should have SDPA attention layers")

            # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time,
            # but it would be nicer to have an efficient way to use parameterized.expand
            cases = [
                (use_mask, output_attentions, sdpa_backend, batch_size)
                for use_mask in use_attention_mask_options
                for output_attentions in [True, False]
                for sdpa_backend in [
                    SDPBackend.MATH,
                    [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH],
                    [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH],
                    [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH],
                ]
                for batch_size in [1, 5]
            ]
            fail_cases = []

            for use_mask, output_attentions, sdpa_backend, batch_size in cases:
                # Shallow copy: the dict is new, the tensors are still shared with `inputs_dict`.
                processed_inputs = inputs_dict.copy()

                # convert to torch_dtype
                if "pixel_values" in processed_inputs:
                    processed_inputs["pixel_values"] = processed_inputs["pixel_values"].to(dtype)

                # slice for different batch sizes
                for key in ["pixel_values", "input_ids", "attention_mask"]:
                    if key in processed_inputs:
                        processed_inputs[key] = processed_inputs[key][:batch_size]

                # set attention mask with left padding
                if not use_mask:
                    processed_inputs.pop("attention_mask", None)
                else:
                    # Build a fresh tensor instead of writing into the sliced view, which
                    # shares storage with the mask stored in `inputs_dict`.
                    dummy_attention_mask = torch.ones_like(processed_inputs["attention_mask"])
                    dummy_attention_mask[:, :1] = 0
                    processed_inputs["attention_mask"] = dummy_attention_mask

                processed_inputs["output_attentions"] = output_attentions
                processed_inputs["output_hidden_states"] = True

                current_case = (
                    f"padding_side=left, use_mask={use_mask}, batch_size={batch_size}, sdpa_backend={sdpa_backend}"
                )

                prepared_inputs = self._prepare_for_class(processed_inputs, model_class)

                with torch.no_grad():
                    try:
                        with sdpa_kernel(sdpa_backend):
                            outputs_eager = model_eager(**prepared_inputs)
                            outputs_sdpa = model_sdpa(**prepared_inputs)
                    except Exception as e:
                        # Record the failure and keep going so that one broken
                        # combination does not hide the results of the others.
                        fail_cases.append(f"{current_case}: {e}")
                        continue

                for key in logit_keys:
                    eager_logits = outputs_eager[key]
                    sdpa_logits = outputs_sdpa[key]

                    # Only per-token outputs (3-D; assumed (batch, seq, hidden) - TODO confirm)
                    # have a padded first position to drop. Slicing 2-D outputs would discard
                    # an arbitrary column and, for (1, 1) logits at batch_size=1, leave an
                    # empty tensor for which `allclose` is trivially true.
                    if use_mask and eager_logits.dim() == 3:
                        eager_logits = eager_logits[:, 1:]
                        sdpa_logits = sdpa_logits[:, 1:]

                    is_close = torch.allclose(eager_logits, sdpa_logits, atol=atol, rtol=rtol)
                    if not is_close:
                        fail_cases.append(get_mean_reldiff(key, current_case, sdpa_logits, eager_logits, atol, rtol))

            self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
class SiglipVisionModelTester:
def __init__(
self,
@@ -135,7 +299,7 @@ class SiglipVisionModelTester:
@require_torch
class SiglipVisionModelTest(ModelTesterMixin, unittest.TestCase):
class SiglipVisionModelTest(SiglipModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as SIGLIP does not use input_ids, inputs_embeds,
attention_mask and seq_length.
@@ -225,6 +389,17 @@ class SiglipVisionModelTest(ModelTesterMixin, unittest.TestCase):
model = SiglipVisionModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
    """Run the shared eager-vs-SDPA check on the vision model outputs.

    Only the mask-free path is requested here.
    """
    compared_outputs = ("pooler_output", "last_hidden_state")
    mask_options = (False,)
    super().test_eager_matches_sdpa_inference(
        torch_dtype=torch_dtype,
        use_attention_mask_options=mask_options,
        logit_keys=compared_outputs,
    )
class SiglipTextModelTester:
def __init__(
@@ -314,7 +489,7 @@ class SiglipTextModelTester:
@require_torch
class SiglipTextModelTest(ModelTesterMixin, unittest.TestCase):
class SiglipTextModelTest(SiglipModelTesterMixin, unittest.TestCase):
all_model_classes = (SiglipTextModel,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
@@ -376,6 +551,17 @@ class SiglipTextModelTest(ModelTesterMixin, unittest.TestCase):
model = SiglipTextModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
    """Run the shared eager-vs-SDPA check on the text model outputs.

    Both the mask-free and the masked paths are requested, in that order.
    """
    compared_outputs = ("pooler_output", "last_hidden_state")
    mask_options = (False, True)
    super().test_eager_matches_sdpa_inference(
        torch_dtype=torch_dtype,
        use_attention_mask_options=mask_options,
        logit_keys=compared_outputs,
    )
class SiglipModelTester:
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
@@ -429,7 +615,7 @@ class SiglipModelTester:
@require_torch
class SiglipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (SiglipModel,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": SiglipModel} if is_torch_available() else {}
fx_compatible = False
@@ -571,6 +757,100 @@ class SiglipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
model = SiglipModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
def test_flash_attn_2_inference_equivalence(self):
    """Check that Flash Attention 2 matches the default attention implementation.

    Each model class is saved once and reloaded twice in bfloat16, once with
    ``attn_implementation="flash_attention_2"`` and once with the default
    implementation. Image and text logits are compared without and then with
    a left-padded attention mask, and finally the FA2 model is run once in
    train mode as a smoke test.
    """
    for model_class in self.all_model_classes:
        if not model_class._supports_flash_attn_2:
            self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        model = model_class(config)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            model_fa = model_class.from_pretrained(
                tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
            )
            model_fa.to(torch_device)

            model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
            model.to(torch_device)

            dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16)
            dummy_input_ids = inputs_dict["input_ids"]

            outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True)
            outputs_fa = model_fa(
                pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True
            )

            self.assertTrue(
                torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2),
                f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}",
            )
            self.assertTrue(
                torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2),
                f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}",
            )

            # Test with attention mask
            dummy_attention_mask = inputs_dict["attention_mask"]

            if dummy_attention_mask is not None:
                # Left padding: first position masked, the rest attended. A fresh tensor is
                # built so the mask stored in `inputs_dict` is not modified in place.
                dummy_attention_mask = torch.ones_like(dummy_attention_mask)
                dummy_attention_mask[:, :1] = 0

            outputs = model(
                pixel_values=dummy_pixel_values,
                input_ids=dummy_input_ids,
                attention_mask=dummy_attention_mask,
                output_hidden_states=True,
            )
            outputs_fa = model_fa(
                pixel_values=dummy_pixel_values,
                input_ids=dummy_input_ids,
                attention_mask=dummy_attention_mask,
                output_hidden_states=True,
            )

            self.assertTrue(
                torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2),
                f"Logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}",
            )
            self.assertTrue(
                torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2),
                f"Logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}",
            )

            # check with inference + dropout
            # The model exercised below is `model_fa`, so that is the one that has to be
            # switched to train mode; the reference model is not used again.
            model_fa.train()
            _ = model_fa(
                pixel_values=dummy_pixel_values,
                input_ids=dummy_input_ids,
                attention_mask=dummy_attention_mask,
                output_hidden_states=True,
            )
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
def test_flash_attn_2_inference_equivalence_right_padding(self):
    """Always skipped, with the reason given in the skip message."""
    skip_reason = "SigLIP does not support right padding"
    self.skipTest(skip_reason)
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
    """Run the shared eager-vs-SDPA check on the joint image/text outputs.

    Both the mask-free and the masked paths are requested, in that order.
    """
    compared_outputs = ("logits_per_image", "logits_per_text", "image_embeds", "text_embeds")
    mask_options = (False, True)
    super().test_eager_matches_sdpa_inference(
        torch_dtype=torch_dtype,
        use_attention_mask_options=mask_options,
        logit_keys=compared_outputs,
    )
class SiglipForImageClassificationModelTester(SiglipModelTester):
def __init__(self, parent):
@@ -594,7 +874,7 @@ class SiglipForImageClassificationModelTester(SiglipModelTester):
@require_torch
class SiglipForImageClassificationModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
class SiglipForImageClassificationModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (SiglipForImageClassification,) if is_torch_available() else ()
pipeline_model_mapping = {"image-classification": SiglipForImageClassification} if is_torch_available() else {}
fx_compatible = False
@@ -636,6 +916,15 @@ class SiglipForImageClassificationModelTest(ModelTesterMixin, PipelineTesterMixi
def test_initialization(self):
pass
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
    """Run the shared eager-vs-SDPA check on the classification logits.

    Only the mask-free path is requested here.
    """
    compared_outputs = ("logits",)
    mask_options = (False,)
    super().test_eager_matches_sdpa_inference(
        torch_dtype=torch_dtype,
        use_attention_mask_options=mask_options,
        logit_keys=compared_outputs,
    )
# We will verify our results on an image of cute cats
def prepare_img():