From 9215cc62d4366072aacafa4e44028c1ca187167b Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 14 Mar 2025 13:56:21 +0000 Subject: [PATCH] Try working around the processor registration bugs (#36184) * Try working around the processor registration bugs * oops * Update error message * Clarify error * Docstring docstring docstring * The extra content is indexed by config class, so let's grab some values out of there * Commit my confusion as a TODO * Resolve my confusion * Cleanup and mostly revert to the original * Better autoclass fallback * Don't nest f-strings you lunatic * Clearer error message * Less getattr() * Revert a lot of changes to try a different approach! * Try the global registry * Check the dynamic list as well as the transformers root * Move the dynamic list somewhere safer * Move the dynamic list somewhere even safer * More import cleanup * Simplify all the register_for_auto_class methods * Set _auto_class in the register() methods * Stop setting the cls attribute in register() * Restore specifying the model class for Model derivatives only * Fix accidentally taking the .__class__ of a class * Revert register_for_auto_class changes * Fix get_possibly_dynamic_module * No more ALL_CUSTOM_CLASSES * Fix up get_possibly_dynamic_module as well * Revert unnecessary formatting changes * Trigger tests --- .../models/auto/processing_auto.py | 2 +- .../models/auto/tokenization_auto.py | 1 + src/transformers/processing_utils.py | 42 ++++++++++++++++--- tests/models/auto/test_processor_auto.py | 34 +++++++++++++++ 4 files changed, 73 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 53ddcd3d1e..c65219b0bc 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -382,6 +382,6 @@ class AutoProcessor: Args: config_class ([`PretrainedConfig`]): The configuration corresponding to the model to register. - processor_class ([`FeatureExtractorMixin`]): The processor to register. + processor_class ([`ProcessorMixin`]): The processor to register. """ PROCESSOR_MAPPING.register(config_class, processor_class, exist_ok=exist_ok) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index cb3e921f8e..7d838b1ad2 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -990,6 +990,7 @@ class AutoTokenizer: f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}." ) + @staticmethod def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False): """ Register a new tokenizer in this mapping. diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index e709878f1c..9872887fe5 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -69,7 +69,7 @@ from .utils import ( logger = logging.get_logger(__name__) -# Dynamically import the Transformers module to grab the attribute classes of the processor form their names. +# Dynamically import the Transformers module to grab the attribute classes of the processor from their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -470,9 +470,9 @@ class ProcessorMixin(PushToHubMixin): # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class. class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name) if isinstance(class_name, tuple): - proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None) + proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None) else: - proper_class = getattr(transformers_module, class_name) + proper_class = self.get_possibly_dynamic_module(class_name) if not isinstance(arg, proper_class): raise TypeError( @@ -1100,11 +1100,19 @@ class ProcessorMixin(PushToHubMixin): @classmethod def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + """ + Identify and instantiate the subcomponents of Processor classes, like image processors and + tokenizers. This method uses the Processor attributes like `tokenizer_class` to figure out what class those + subcomponents should be. Note that any subcomponents must either be library classes that are accessible in + the `transformers` root, or they must be custom code that has been registered with the relevant autoclass, + via methods like `AutoTokenizer.register()`. If neither of these conditions are fulfilled, this method + will be unable to find the relevant subcomponent class and will raise an error. + """ args = [] for attribute_name in cls.attributes: class_name = getattr(cls, f"{attribute_name}_class") if isinstance(class_name, tuple): - classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name) + classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name) if attribute_name == "image_processor": # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default) use_fast = kwargs.get("use_fast", None) @@ -1121,11 +1129,35 @@ class ProcessorMixin(PushToHubMixin): else: attribute_class = classes[0] else: - attribute_class = getattr(transformers_module, class_name) + attribute_class = cls.get_possibly_dynamic_module(class_name) args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) return args + @staticmethod + def get_possibly_dynamic_module(module_name): + if hasattr(transformers_module, module_name): + return getattr(transformers_module, module_name) + lookup_locations = [ + transformers_module.IMAGE_PROCESSOR_MAPPING, + transformers_module.TOKENIZER_MAPPING, + transformers_module.FEATURE_EXTRACTOR_MAPPING, + ] + for lookup_location in lookup_locations: + for custom_class in lookup_location._extra_content.values(): + if isinstance(custom_class, tuple): + for custom_subclass in custom_class: + if custom_subclass is not None and custom_subclass.__name__ == module_name: + return custom_subclass + elif custom_class is not None and custom_class.__name__ == module_name: + return custom_class + else: + raise ValueError( + f"Could not find module {module_name} in `transformers`. If this is a custom class, " + f"it should be registered using the relevant `AutoClass.register()` function so that " + f"other functions can find it!" + ) + @property def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index fd361f160f..41a2815e4d 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -354,6 +354,40 @@ class AutoFeatureExtractorTest(unittest.TestCase): if CustomConfig in PROCESSOR_MAPPING._extra_content: del PROCESSOR_MAPPING._extra_content[CustomConfig] + def test_dynamic_processor_with_specific_dynamic_subcomponents(self): + class NewFeatureExtractor(Wav2Vec2FeatureExtractor): + pass + + class NewTokenizer(BertTokenizer): + pass + + class NewProcessor(ProcessorMixin): + feature_extractor_class = "NewFeatureExtractor" + tokenizer_class = "NewTokenizer" + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + + try: + AutoConfig.register("custom", CustomConfig) + AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor) + AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer) + AutoProcessor.register(CustomConfig, NewProcessor) + # If remote code is not set, the default is to use local classes. + processor = AutoProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_processor", + ) + self.assertEqual(processor.__class__.__name__, "NewProcessor") + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: + del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] + if CustomConfig in TOKENIZER_MAPPING._extra_content: + del TOKENIZER_MAPPING._extra_content[CustomConfig] + if CustomConfig in PROCESSOR_MAPPING._extra_content: + del PROCESSOR_MAPPING._extra_content[CustomConfig] + def test_auto_processor_creates_tokenizer(self): processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert") self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")