Add support for fine-tuning CLIP-like models using contrastive-image-text example (#29070)

* Add support for SigLIP and Chinese-CLIP model training with the contrastive-image-text example
* Codebase fixups
2024-02-20 05:08:31 -07:00
parent 0996a10077
commit ee3af60be0
6 changed files with 20 additions and 7 deletions
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -1070,6 +1070,7 @@ MODELS_NOT_IN_README = [
    "VisionTextDualEncoder",
    "CLIPVisionModel",
    "SiglipVisionModel",
+    "ChineseCLIPVisionModel",
 ]

 # Template for new entries to add in the main README when we have missing models.
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -171,7 +171,7 @@ MODEL_NAMES_WITH_SAME_CONFIG = {
    "XLS-R": "Wav2Vec2",
    "XLSR-Wav2Vec2": "Wav2Vec2",
 }
-MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel", "SiglipVisionModel"]
+MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel", "SiglipVisionModel", "ChineseCLIPVisionModel"]


 def get_model_table_from_auto_modules() -> str: