Add support for fine-tuning CLIP-like models using contrastive-image-text example (#29070)
* Add support for SigLIP and Chinese-CLIP model training with the contrastive-image-text example
* Codebase fixups
This commit is contained in:
committed by
GitHub
parent
0996a10077
commit
ee3af60be0
@@ -1070,6 +1070,7 @@ MODELS_NOT_IN_README = [
|
||||
"VisionTextDualEncoder",
|
||||
"CLIPVisionModel",
|
||||
"SiglipVisionModel",
|
||||
"ChineseCLIPVisionModel",
|
||||
]
|
||||
|
||||
# Template for new entries to add in the main README when we have missing models.
|
||||
|
||||
@@ -171,7 +171,7 @@ MODEL_NAMES_WITH_SAME_CONFIG = {
|
||||
"XLS-R": "Wav2Vec2",
|
||||
"XLSR-Wav2Vec2": "Wav2Vec2",
|
||||
}
|
||||
MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel", "SiglipVisionModel"]
|
||||
MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel", "SiglipVisionModel", "ChineseCLIPVisionModel"]
|
||||
|
||||
|
||||
def get_model_table_from_auto_modules() -> str:
|
||||
|
||||
Reference in New Issue
Block a user