[GroundingDino] Fix grounding dino loss 🚨 (#31828)

* Starting to fix GroundingDinoLoss and GroundingDinoHungarianMatcher * More updates * More updates * fixed: GroundingDinoLoss * fixed: failing tests * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/grounding_dino/test_modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Addressed comments * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> * add: cardinality loss and make box loss as copy from * change: default for reduction loss is sum * fix: vectorized generate fake box * fix copies * Addressed 
comments * addressed comments * addressed one-hot * Update tests/models/grounding_dino/test_modeling_grounding_dino.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> * Addressed comments * fixed test * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py * Update tests/models/grounding_dino/test_modeling_grounding_dino.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Starting to fix GroundingDinoLoss and GroundingDinoHungarianMatcher * More updates * More updates * fixed: GroundingDinoLoss * add: cardinality loss and make box loss as copy from * fix copies * Revert "Update tests/models/grounding_dino/test_modeling_grounding_dino.py" This reverts commit aa74c4c57c430e54cc74c414d6269edb65c73e83. * [run-slow] groundigdino * remove nestedtensor * [run-slow] groundig_dino * [run-slow] grounding_dino * [run-slow] grounding_dino * [run-slow] grounding_dino * check * check * add: enconder intermediate outputs to ImageLoss forward * add: GroundingDinoForObjectDetectionLoss in the loss directory * make style * fix the loss function * remove class_reduction since it sum is default * remove class_reduction * Update src/transformers/loss/loss_grounding_dino.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * simple fix * Update src/transformers/loss/loss_grounding_dino.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * minor fix * Update src/transformers/loss/loss_for_object_detection.py --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> Co-authored-by: sangbumchoi <danielsejong55@gmail.com> Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-27 20:15:58 +01:00
parent 482d17be60
commit 222505c7e4
4 changed files with 487 additions and 18 deletions
--- a/tests/models/grounding_dino/test_modeling_grounding_dino.py
+++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py
@@ -20,6 +20,8 @@ import math
 import re
 import unittest

+from datasets import load_dataset
+
 from transformers import (
    GroundingDinoConfig,
    SwinConfig,
@@ -28,6 +30,7 @@ from transformers import (
 )
 from transformers.file_utils import cached_property
 from transformers.testing_utils import (
+    is_flaky,
    require_timm,
    require_torch,
    require_torch_accelerator,
@@ -37,14 +40,14 @@ from transformers.testing_utils import (
 )

 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin


 if is_torch_available():
    import torch

-    from transformers import GroundingDinoForObjectDetection, GroundingDinoModel
+    from transformers import GroundingDinoConfig, GroundingDinoForObjectDetection, GroundingDinoModel
    from transformers.pytorch_utils import id_tensor_storage


@@ -54,6 +57,39 @@ if is_vision_available():
    from transformers import AutoProcessor


+def generate_fake_bounding_boxes(n_boxes):
+    """Return an (n_boxes, 4) tensor of random boxes as (center_x, center_y, width, height), each kept inside [0, 1]."""
+    # Reject non-integer or non-positive counts (both cases raise ValueError)
+    if not isinstance(n_boxes, int):
+        raise ValueError("n_boxes must be an integer")
+    if n_boxes <= 0:
+        raise ValueError("n_boxes must be a positive integer")
+
+    # Sample all four components (center_x, center_y, width, height) with a single `torch.rand` call
+    bounding_boxes = torch.rand((n_boxes, 4))
+
+    # Split the columns into named components
+    center_x = bounding_boxes[:, 0]
+    center_y = bounding_boxes[:, 1]
+    width = bounding_boxes[:, 2]
+    height = bounding_boxes[:, 3]
+
+    # Cap width and height at 1.0, the full extent of the normalized space
+    width = torch.min(width, torch.tensor(1.0))
+    height = torch.min(height, torch.tensor(1.0))
+
+    # Shift each center so that center -/+ half the size stays within [0, 1]
+    center_x = torch.where(center_x - width / 2 < 0, width / 2, center_x)
+    center_x = torch.where(center_x + width / 2 > 1, 1 - width / 2, center_x)
+    center_y = torch.where(center_y - height / 2 < 0, height / 2, center_y)
+    center_y = torch.where(center_y + height / 2 > 1, 1 - height / 2, center_y)
+
+    # Reassemble the components column-wise into an (n_boxes, 4) tensor
+    bounding_boxes = torch.stack([center_x, center_y, width, height], dim=1)
+
+    return bounding_boxes
+
+
 class GroundingDinoModelTester:
    def __init__(
        self,
@@ -72,7 +108,7 @@ class GroundingDinoModelTester:
        num_channels=3,
        image_size=98,
        n_targets=8,
-        num_labels=3,
+        num_labels=2,
        num_feature_levels=4,
        encoder_n_points=2,
        decoder_n_points=6,
@@ -115,7 +151,11 @@ class GroundingDinoModelTester:
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device)

-        input_ids = ids_tensor([self.batch_size, self.max_text_len], self.num_labels)
+        # `GroundingDino` expects the text input to follow the template '{label1}. {label2}. {label3}. ... {labelN}.'
+        # Therefore, to avoid errors when running tests with `labels`, `input_ids` have to follow this structure.
+        # Otherwise `build_label_maps` will throw an error when trying to split the `input_ids` into segments.
+        input_ids = torch.tensor([101, 3869, 1012, 11420, 3869, 1012, 102], device=torch_device)
+        input_ids = input_ids.unsqueeze(0).expand(self.batch_size, -1)

        labels = None
        if self.use_labels:
@@ -126,7 +166,7 @@ class GroundingDinoModelTester:
                target["class_labels"] = torch.randint(
                    high=self.num_labels, size=(self.n_targets,), device=torch_device
                )
-                target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device)
+                target["boxes"] = generate_fake_bounding_boxes(self.n_targets).to(torch_device)
                target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device)
                labels.append(target)

@@ -317,7 +357,7 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
            )
            out_len = len(outputs)

-            correct_outlen = 10
+            correct_outlen = 12

            # loss is at first position
            if "labels" in inputs_dict:
@@ -677,6 +717,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
        self.assertListEqual(results["text_labels"], expected_labels)

    @require_torch_accelerator
+    @is_flaky()
    def test_inference_object_detection_head_equivalence_cpu_gpu(self):
        processor = self.default_processor
        image = prepare_img()
@@ -716,6 +757,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
        torch.testing.assert_close(results_cpu["scores"], result_gpu["scores"].cpu(), rtol=1e-3, atol=1e-3)
        torch.testing.assert_close(results_cpu["boxes"], result_gpu["boxes"].cpu(), rtol=1e-3, atol=1e-3)

+    @is_flaky()
    def test_cross_attention_mask(self):
        model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(torch_device)

@@ -740,4 +782,56 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):

        torch.testing.assert_close(outputs1.logits, outputs_batched.logits[:1], rtol=1e-3, atol=1e-3)
        # For some reason 12 elements are > 1e-3, but the rest are fine
-        torch.testing.assert_close(outputs2.logits, outputs_batched.logits[1:], rtol=1.8e-3, atol=1.8e-3)
+        self.assertTrue(torch.allclose(outputs2.logits, outputs_batched.logits[1:], atol=1.8e-3))
+
+    def test_grounding_dino_loss(self):
+        ds = load_dataset("EduardoPacheco/aquarium-sample", split="train")
+        image_processor = self.default_processor.image_processor
+        tokenizer = self.default_processor.tokenizer
+        id2label = {0: "fish", 1: "jellyfish", 2: "penguins", 3: "sharks", 4: "puffins", 5: "stingrays", 6: "starfish"}
+        prompt = ". ".join(id2label.values()) + "."
+
+        text_inputs = tokenizer([prompt, prompt], return_tensors="pt")
+        image_inputs = image_processor(images=ds["image"], annotations=ds["annotations"], return_tensors="pt")
+
+        # Pass auxiliary_loss=True so the output can be compared against the full expected loss dict below
+        model = GroundingDinoForObjectDetection.from_pretrained(
+            "IDEA-Research/grounding-dino-tiny",
+            auxiliary_loss=True,
+        )
+        # Only the loss values are of interest, so run in eval mode without tracking gradients
+        model.eval()
+        with torch.no_grad():
+            outputs = model(**text_inputs, **image_inputs)
+
+        # Loss values differ between CPU and GPU, and these expected values may change in the future.
+        expected_loss_dict = {
+            "loss_ce": torch.tensor(1.1147),
+            "loss_bbox": torch.tensor(0.2031),
+            "loss_giou": torch.tensor(0.5819),
+            "loss_ce_0": torch.tensor(1.1941),
+            "loss_bbox_0": torch.tensor(0.1978),
+            "loss_giou_0": torch.tensor(0.5524),
+            "loss_ce_1": torch.tensor(1.1621),
+            "loss_bbox_1": torch.tensor(0.1909),
+            "loss_giou_1": torch.tensor(0.5892),
+            "loss_ce_2": torch.tensor(1.1641),
+            "loss_bbox_2": torch.tensor(0.1892),
+            "loss_giou_2": torch.tensor(0.5626),
+            "loss_ce_3": torch.tensor(1.1943),
+            "loss_bbox_3": torch.tensor(0.1941),
+            "loss_giou_3": torch.tensor(0.5607),
+            "loss_ce_4": torch.tensor(1.0956),
+            "loss_bbox_4": torch.tensor(0.2008),
+            "loss_giou_4": torch.tensor(0.5836),
+            "loss_ce_enc": torch.tensor(16226.3164),
+            "loss_bbox_enc": torch.tensor(0.3063),
+            "loss_giou_enc": torch.tensor(0.7380),
+        }
+
+        expected_loss = torch.tensor(32482.2305)
+
+        for key in expected_loss_dict:
+            self.assertTrue(torch.allclose(outputs.loss_dict[key], expected_loss_dict[key], atol=1e-3))
+
+        self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-3))