Add support for fine-tuning CLIP-like models using contrastive-image-text example (#29070)

* add support for siglip and chinese-clip model training with contrastive-image-text example
* codebase fixups
2024-02-20 05:08:31 -07:00
parent 0996a10077
commit ee3af60be0
6 changed files with 20 additions and 7 deletions
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -1070,6 +1070,7 @@ MODELS_NOT_IN_README = [
    "VisionTextDualEncoder",
    "CLIPVisionModel",
    "SiglipVisionModel",
+    "ChineseCLIPVisionModel",
 ]

 # Template for new entries to add in the main README when we have missing models.