diff --git a/docs/source/main_classes/processors.mdx b/docs/source/main_classes/processors.mdx index 8f5f1e048d..2aaca485df 100644 --- a/docs/source/main_classes/processors.mdx +++ b/docs/source/main_classes/processors.mdx @@ -12,10 +12,22 @@ specific language governing permissions and limitations under the License. # Processors -This library includes processors for several traditional tasks. These processors can be used to process a dataset into -examples that can be fed to a model. +Processors can mean two different things in the Transformers library: +- the objects that preprocess inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text) + or [CLIP](../model_doc/clip) (text and vision) +- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQuAD. -## Processors +## Multi-modal processors + +Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, +vision and audio). This is handled by objects called processors, which group tokenizers (for the text modality) and +feature extractors (for vision and audio). + +Those processors inherit from the following base class that implements the saving and loading functionality: + +[[autodoc]] ProcessorMixin + +## Deprecated processors All processors follow the same architecture which is that of the [`~data.processors.utils.DataProcessor`].
The processor returns a list of diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 18d7c35b65..2476d41fc8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -95,7 +95,7 @@ _import_structure = { "dependency_versions_table": [], "dynamic_module_utils": [], "feature_extraction_sequence_utils": ["SequenceFeatureExtractor"], - "feature_extraction_utils": ["BatchFeature"], + "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"], "file_utils": [ "CONFIG_NAME", "MODEL_CARD_NAME", @@ -365,6 +365,7 @@ _import_structure = { "ZeroShotClassificationPipeline", "pipeline", ], + "processing_utils": ["ProcessorMixin"], "testing_utils": [], "tokenization_utils": ["PreTrainedTokenizer"], "tokenization_utils_base": [ @@ -2307,7 +2308,7 @@ if TYPE_CHECKING: from .feature_extraction_sequence_utils import SequenceFeatureExtractor # Feature Extractor - from .feature_extraction_utils import BatchFeature + from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin # Files and general utilities from .file_utils import ( @@ -2555,6 +2556,7 @@ if TYPE_CHECKING: ZeroShotClassificationPipeline, pipeline, ) + from .processing_utils import ProcessorMixin # Tokenization from .tokenization_utils import PreTrainedTokenizer diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index ebbde87272..2323dbc7e8 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -15,12 +15,11 @@ """ Image/Text processor class for CLIP """ +from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding -from .feature_extraction_clip import CLIPFeatureExtractor -from .tokenization_clip import CLIPTokenizer -class CLIPProcessor: +class CLIPProcessor(ProcessorMixin): r""" Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor. 
@@ -33,77 +32,13 @@ class CLIPProcessor: tokenizer ([`CLIPTokenizer`]): The tokenizer is a required input. """ + feature_extractor_class = "CLIPFeatureExtractor" + tokenizer_class = "CLIPTokenizer" def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, CLIPFeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type CLIPFeatureExtractor, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, CLIPTokenizer): - raise ValueError(f"`tokenizer` has to be of type CLIPTokenizer, but is {type(tokenizer)}") - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it - can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method. - - - - This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor. 
- - - - This class method is simply calling CLIPFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and - CLIPTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the - docstrings of the methods above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or - namespaced under a user or organization name, like `openai/clip-vit-base-patch32`. - - a path to a *directory* containing a feature extractor file saved using the - [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - - **kwargs - Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py index b727def91c..4277d3a1b2 100644 --- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py @@ -18,13 +18,11 @@ Processor class for LayoutLMv2. 
from typing import List, Optional, Union from ...file_utils import TensorType +from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor -from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer -from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast -class LayoutLMv2Processor: +class LayoutLMv2Processor(ProcessorMixin): r""" Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a single processor. @@ -43,84 +41,8 @@ class LayoutLMv2Processor: tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`): An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input. """ - - def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)): - raise ValueError( - f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__} or {LayoutLMv2TokenizerFast.__class__}, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer - - def save_pretrained(self, save_directory): - """ - Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory `save_directory`, - so that it can be re-loaded using the [`~LayoutLMv2Processor.from_pretrained`] class method. - - - - This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. 
- - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs): - r""" - Instantiate a [`LayoutLMv2Processor`] from a pretrained LayoutLMv2 processor. - - - - This class method is simply calling LayoutLMv2FeatureExtractor's - [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutLMv2TokenizerFast's - [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - - use_fast (`bool`, *optional*, defaults to `True`): - Whether or not to instantiate a fast tokenizer. 
- - **kwargs - Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - if use_fast: - tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) - else: - tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) + feature_extractor_class = "LayoutLMv2FeatureExtractor" + tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast") def __call__( self, diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index 231810bf5e..5bd1dffb5c 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -17,15 +17,12 @@ Processor class for LayoutXLM. """ from typing import List, Optional, Union -from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor - from ...file_utils import TensorType +from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from .tokenization_layoutxlm import LayoutXLMTokenizer -from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast -class LayoutXLMProcessor: +class LayoutXLMProcessor(ProcessorMixin): r""" Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a single processor. @@ -44,84 +41,8 @@ class LayoutXLMProcessor: tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`): An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input. 
""" - - def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)): - raise ValueError( - f"`tokenizer` has to be of type {LayoutXLMTokenizer.__class__} or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer - - def save_pretrained(self, save_directory): - """ - Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory `save_directory`, so - that it can be re-loaded using the [`~LayoutXLMProcessor.from_pretrained`] class method. - - - - This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs): - r""" - Instantiate a [`LayoutXLMProcessor`] from a pretrained LayoutXLM processor. - - - - This class method is simply calling Layoutv2FeatureExtractor's - [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutXLMTokenizerFast's - [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. 
Please refer to the docstrings of the methods - above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - - use_fast (`bool`, *optional*, defaults to `True`): - Whether or not to instantiate a fast tokenizer. - - **kwargs - Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - if use_fast: - tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) - else: - tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) + feature_extractor_class = "LayoutLMv2FeatureExtractor" + tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast") def __call__( self, diff --git a/src/transformers/models/speech_to_text/processing_speech_to_text.py b/src/transformers/models/speech_to_text/processing_speech_to_text.py index 8d983d2cc0..969df9d108 100644 --- a/src/transformers/models/speech_to_text/processing_speech_to_text.py +++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py @@ -17,11 +17,10 @@ Speech processor class for Speech2Text """ from contextlib import contextmanager -from .feature_extraction_speech_to_text import 
Speech2TextFeatureExtractor -from .tokenization_speech_to_text import Speech2TextTokenizer +from ...processing_utils import ProcessorMixin -class Speech2TextProcessor: +class Speech2TextProcessor(ProcessorMixin): r""" Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a single processor. @@ -36,79 +35,13 @@ class Speech2TextProcessor: tokenizer (`Speech2TextTokenizer`): An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input. """ + feature_extractor_class = "Speech2TextFeatureExtractor" + tokenizer_class = "Speech2TextTokenizer" def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, Speech2TextFeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {Speech2TextFeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, Speech2TextTokenizer): - raise ValueError( - f"`tokenizer` has to be of type {Speech2TextTokenizer.__class__}, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a Speech2Text feature extractor object and Speech2Text tokenizer object to the directory `save_directory`, - so that it can be re-loaded using the [`~Speech2TextProcessor.from_pretrained`] class method. - - - - This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). 
- """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`Speech2TextProcessor`] from a pretrained Speech2Text processor. - - - - This class method is simply calling Speech2TextFeatureExtractor's - [`~PreTrainedFeatureExtractor.from_pretrained`] and Speech2TextTokenizer's - [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. 
- **kwargs - Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = Speech2TextFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = Speech2TextTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, *args, **kwargs): """ When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py index 5567ef982b..28189ba881 100644 --- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py @@ -17,12 +17,10 @@ Speech processor class for Speech2Text2 """ from contextlib import contextmanager -from ...feature_extraction_sequence_utils import SequenceFeatureExtractor -from ..auto.feature_extraction_auto import AutoFeatureExtractor -from .tokenization_speech_to_text_2 import Speech2Text2Tokenizer +from ...processing_utils import ProcessorMixin -class Speech2Text2Processor: +class Speech2Text2Processor(ProcessorMixin): r""" Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into a single processor. @@ -36,77 +34,13 @@ class Speech2Text2Processor: tokenizer (`Speech2Text2Tokenizer`): An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input. 
""" + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "Speech2Text2Tokenizer" def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, SequenceFeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {SequenceFeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, Speech2Text2Tokenizer): - raise ValueError( - f"`tokenizer` has to be of type {Speech2Text2Tokenizer.__class__}, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a Speech2Text2 feature extractor object and Speech2Text2 tokenizer object to the directory - `save_directory`, so that it can be re-loaded using the [`~Speech2Text2Processor.from_pretrained`] class - method. - - - - This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - - self.feature_extractor.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`Speech2Text2Processor`] from a pretrained Speech2Text2 processor. - - - - This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and - Speech2Text2Tokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the - docstrings of the methods above for more information. 
- - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - **kwargs - Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = Speech2Text2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, *args, **kwargs): """ When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py index a74dc0b38e..2c7893a091 100644 --- a/src/transformers/models/trocr/processing_trocr.py +++ b/src/transformers/models/trocr/processing_trocr.py @@ -17,15 +17,10 @@ Processor class for TrOCR. 
""" from contextlib import contextmanager -from transformers import AutoFeatureExtractor, AutoTokenizer -from transformers.feature_extraction_utils import FeatureExtractionMixin -from transformers.models.roberta.tokenization_roberta import RobertaTokenizer -from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast -from transformers.models.xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer -from transformers.models.xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast +from ...processing_utils import ProcessorMixin -class TrOCRProcessor: +class TrOCRProcessor(ProcessorMixin): r""" Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor. @@ -39,78 +34,13 @@ class TrOCRProcessor: tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`]): An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input. """ + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "AutoTokenizer" def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, FeatureExtractionMixin): - raise ValueError( - f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance( - tokenizer, (RobertaTokenizer, RobertaTokenizerFast, XLMRobertaTokenizer, XLMRobertaTokenizerFast) - ): - raise ValueError( - f"`tokenizer` has to be of type {RobertaTokenizer.__class__} or {RobertaTokenizerFast.__class__} or {XLMRobertaTokenizer.__class__} or {XLMRobertaTokenizerFast.__class__}, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory `save_directory`, so that it - can be 
re-loaded using the [`~TrOCRProcessor.from_pretrained`] class method. - - - - This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - - self.feature_extractor.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor. - - - - This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and - TrOCRTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the - docstrings of the methods above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. 
- **kwargs - Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, *args, **kwargs): """ When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 50ca918065..602d85de68 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -18,14 +18,12 @@ Processor class for ViLT. from typing import List, Optional, Union -from transformers import BertTokenizerFast - from ...file_utils import TensorType +from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from .feature_extraction_vilt import ViltFeatureExtractor -class ViltProcessor: +class ViltProcessor(ProcessorMixin): r""" Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor. @@ -38,75 +36,13 @@ class ViltProcessor: tokenizer (`BertTokenizerFast`): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. 
""" + feature_extractor_class = "ViltFeatureExtractor" + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, ViltFeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {ViltFeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, BertTokenizerFast): - raise ValueError(f"`tokenizer` has to be of type {BertTokenizerFast.__class__}, but is {type(tokenizer)}") - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a ViLT feature_extractor object and BERT tokenizer object to the directory `save_directory`, so that it - can be re-loaded using the [`~ViltProcessor.from_pretrained`] class method. - - - - This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - - self.feature_extractor.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`ViltProcessor`] from a pretrained ViLT processor. - - - - This class method is simply calling ViltFeatureExtractor's - [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and BertTokenizerFast's - [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods - above for more information. 
- - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - **kwargs - Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = ViltFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__( self, images, diff --git a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py index 4406f80d3d..6cc58b2627 100644 --- a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py @@ -15,17 +15,12 @@ """ Processor class for VisionTextDualEncoder """ -from typing import Union - -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.feature_extraction_utils import FeatureExtractionMixin +from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding -from ..auto.feature_extraction_auto import AutoFeatureExtractor -from ..auto.tokenization_auto import AutoTokenizer -class VisionTextDualEncoderProcessor: +class 
VisionTextDualEncoderProcessor(ProcessorMixin): r""" Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single processor. @@ -40,82 +35,13 @@ class VisionTextDualEncoderProcessor: tokenizer ([`PreTrainedTokenizer`]): The tokenizer is a required input. """ + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "AutoTokenizer" - def __init__( - self, feature_extractor: FeatureExtractionMixin, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] - ): - if not isinstance(feature_extractor, FeatureExtractionMixin): - raise ValueError( - f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): - raise ValueError( - f"`tokenizer` has to be of type `PreTrainedTokenizer` or `PreTrainedTokenizerFast`, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a VisionTextDualEncoder feature extractor object and VisionTextDualEncoder tokenizer object to the - directory `save_directory`, so that it can be re-loaded using the - [`~VisionTextDualEncoderProcessor.from_pretrained`] class method. - - - - This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). 
- """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`VisionTextDualEncoderProcessor`] from a pretrained VisionTextDualEncoder processor. - - - - This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and - AutoTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the - docstrings of the methods above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - - **kwargs - Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 7679272142..1470c254dc 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -18,14 +18,12 @@ Speech processor class for Wav2Vec2 import warnings from contextlib import contextmanager -from ...tokenization_utils import PreTrainedTokenizer -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ..auto.tokenization_auto import AutoTokenizer +from ...processing_utils import ProcessorMixin from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer -class Wav2Vec2Processor: +class Wav2Vec2Processor(ProcessorMixin): r""" Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single processor. @@ -39,82 +37,17 @@ class Wav2Vec2Processor: tokenizer ([`PreTrainedTokenizer`]): An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input. 
""" + feature_extractor_class = "Wav2Vec2FeatureExtractor" + tokenizer_class = "AutoTokenizer" def __init__(self, feature_extractor, tokenizer): - if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): - raise ValueError( - f"`tokenizer` has to be of type {PreTrainedTokenizer.__class__}, but is {type(tokenizer)}" - ) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor - def save_pretrained(self, save_directory): - """ - Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so - that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method. - - - - This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) - @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor. 
- - - - This class method is simply calling Wav2Vec2FeatureExtractor's - [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and PreTrainedTokenizer's - [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods - above for more information. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a feature extractor file saved using the - [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. - - a path or url to a saved feature extractor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - **kwargs - Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and - [`PreTrainedTokenizer`] - """ - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - - # load generic `AutoTokenizer` - # need fallback here for backward compatibility in case processor is - # loaded from just a tokenizer file that does not have a `tokenizer_class` attribute - # behavior should be deprecated in major future release try: - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + return super().from_pretrained(pretrained_model_name_or_path, **kwargs) except OSError: warnings.warn( f"Loading a tokenizer inside {cls.__name__} from a config that does not" @@ -124,9 +57,11 @@ class Wav2Vec2Processor: "file to suppress this warning: ", FutureWarning, ) + + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) tokenizer = 
Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) + return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) def __call__(self, *args, **kwargs): """ diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index 148e42ec66..c31b209c18 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -23,16 +23,16 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union import numpy as np -from ...feature_extraction_utils import FeatureExtractionMixin from ...file_utils import ModelOutput, requires_backends -from ...tokenization_utils import PreTrainedTokenizer -from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor -from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer +from ...processing_utils import ProcessorMixin if TYPE_CHECKING: from pyctcdecode import BeamSearchDecoderCTC + from ...feature_extraction_utils import FeatureExtractionMixin + from ...tokenization_utils import PreTrainedTokenizerBase + @dataclass class Wav2Vec2DecoderWithLMOutput(ModelOutput): @@ -47,7 +47,7 @@ class Wav2Vec2DecoderWithLMOutput(ModelOutput): text: Union[List[str], str] -class Wav2Vec2ProcessorWithLM: +class Wav2Vec2ProcessorWithLM(ProcessorMixin): r""" Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder with language model support into a single processor for language model boosted speech recognition decoding. @@ -60,24 +60,18 @@ class Wav2Vec2ProcessorWithLM: decoder (`pyctcdecode.BeamSearchDecoderCTC`): An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input. 
""" + feature_extractor_class = "Wav2Vec2FeatureExtractor" + tokenizer_class = "Wav2Vec2CTCTokenizer" def __init__( self, - feature_extractor: FeatureExtractionMixin, - tokenizer: PreTrainedTokenizer, + feature_extractor: "FeatureExtractionMixin", + tokenizer: "PreTrainedTokenizerBase", decoder: "BeamSearchDecoderCTC", ): from pyctcdecode import BeamSearchDecoderCTC - if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor): - raise ValueError( - f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}" - ) - if not isinstance(tokenizer, Wav2Vec2CTCTokenizer): - # TODO(PVP) - this can be relaxed in the future to allow other kinds of tokenizers - raise ValueError( - f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer.__class__}, but is {type(tokenizer)}" - ) + super().__init__(feature_extractor, tokenizer) if not isinstance(decoder, BeamSearchDecoderCTC): raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}") @@ -90,37 +84,11 @@ class Wav2Vec2ProcessorWithLM: f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet." ) - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer self.decoder = decoder self.current_processor = self.feature_extractor def save_pretrained(self, save_directory): - """ - Save the Wav2Vec2 feature_extractor, a tokenizer object and a pyctcdecode decoder to the directory - `save_directory`, so that they can be re-loaded using the [`~Wav2Vec2ProcessorWithLM.from_pretrained`] class - method. - - - - This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained,`] - [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`] and pyctcdecode's - [`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`]. - - Please refer to the docstrings of the methods above for more information. 
- - - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will - be created if it does not exist). - """ - self.feature_extractor._set_processor_class(self.__class__.__name__) - self.feature_extractor.save_pretrained(save_directory) - - self.tokenizer._set_processor_class(self.__class__.__name__) - self.tokenizer.save_pretrained(save_directory) + super().save_pretrained(save_directory) self.decoder.save_to_dir(save_directory) @classmethod @@ -157,8 +125,7 @@ class Wav2Vec2ProcessorWithLM: requires_backends(cls, "pyctcdecode") from pyctcdecode import BeamSearchDecoderCTC - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) if os.path.isdir(pretrained_model_name_or_path): decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py new file mode 100644 index 0000000000..ec6196c862 --- /dev/null +++ b/src/transformers/processing_utils.py @@ -0,0 +1,161 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Processing saving/loading class for common processors. 
+"""
+
+import importlib.util
+from pathlib import Path
+
+
+# Load the top-level package that sits next to this file, so that the class names stored as strings on the
+# processors (for instance `feature_extractor_class = "Wav2Vec2FeatureExtractor"`) can be resolved with `getattr`.
+spec = importlib.util.spec_from_file_location(
+    "transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent]
+)
+transformers_module = spec.loader.load_module()
+
+
+# Nothing is ever an instance of an "AutoXxx" class, so those names are mapped to the base class name that is used
+# for the `isinstance` checks in `ProcessorMixin.__init__`.
+AUTO_TO_BASE_CLASS_MAPPING = {
+    "AutoTokenizer": "PreTrainedTokenizerBase",
+    "AutoFeatureExtractor": "FeatureExtractionMixin",
+}
+
+
+class ProcessorMixin:
+    """
+    This is a mixin used to provide saving/loading functionality for all processor classes.
+
+    For every name listed in `attributes`, a subclass sets a class attribute `<name>_class` to the name of the class
+    expected for that attribute. The value is either a single string (possibly an "AutoXxx" name) or a tuple of two
+    names, of which `from_pretrained` picks the second one when `use_fast` is truthy and that entry is not `None`.
+    """
+
+    attributes = ["feature_extractor", "tokenizer"]
+    # Names need to be attr_class for attr in attributes
+    feature_extractor_class = None
+    tokenizer_class = None
+
+    # args have to match the attributes class attribute
+    def __init__(self, *args, **kwargs):
+        """
+        Store one object per entry of `attributes` on the instance, after checking its type.
+
+        Args:
+            *args:
+                Objects matched positionally against `attributes`, in that order.
+            **kwargs:
+                Objects passed by attribute name. Every key has to be one of `attributes`.
+
+        Raises:
+            TypeError: If a keyword is not one of `attributes`, or an attribute is passed both positionally and by
+                keyword.
+            ValueError: If an attribute is missing, or an object is not an instance of the class named by the
+                matching `<attribute>_class`.
+        """
+        # Sanitize args and kwargs
+        for key in kwargs:
+            if key not in self.attributes:
+                raise TypeError(f"Unexpected keyword argument {key}.")
+        for arg, attribute_name in zip(args, self.attributes):
+            if attribute_name in kwargs:
+                raise TypeError(f"Got multiple values for argument {attribute_name}.")
+            else:
+                kwargs[attribute_name] = arg
+
+        # At this point `kwargs` holds positional and keyword arguments alike, so it is the count to report.
+        if len(kwargs) != len(self.attributes):
+            raise ValueError(
+                f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
+                f"{len(kwargs)} arguments instead."
+            )
+
+        # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
+        for attribute_name, arg in kwargs.items():
+            class_name = getattr(self, f"{attribute_name}_class")
+            # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
+            class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
+            if isinstance(class_name, tuple):
+                # Entries of the tuple may be None; any of the remaining classes is accepted.
+                proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
+            else:
+                proper_class = getattr(transformers_module, class_name)
+
+            if not isinstance(arg, proper_class):
+                raise ValueError(
+                    f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected."
+                )
+
+            setattr(self, attribute_name, arg)
+
+    def __repr__(self):
+        """Return the class name followed by one `- name: repr(value)` line per entry of `attributes`."""
+        attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
+        attributes_repr = "\n".join(attributes_repr)
+        return f"{self.__class__.__name__}:\n{attributes_repr}"
+
+    def save_pretrained(self, save_directory):
+        """
+        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
+        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
+
+        This method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
+        above for more information.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
+                be created if it does not exist).
+        """
+        for attribute_name in self.attributes:
+            attribute = getattr(self, attribute_name)
+            # Include the processor class in the attribute config so this processor can then be reloaded with the
+            # `AutoProcessor` API.
+            if hasattr(attribute, "_set_processor_class"):
+                attribute._set_processor_class(self.__class__.__name__)
+            attribute.save_pretrained(save_directory)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r"""
+        Instantiate a processor associated with a pretrained model.
+
+        This class method is simply calling the feature extractor
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
+        methods above for more information.
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            **kwargs
+                Additional keyword arguments passed along to both
+                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
+
+        Returns:
+            An instance of `cls` built from the loaded objects, passed positionally in the order of `attributes`.
+        """
+        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return cls(*args)
+
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """
+        Load one object per entry of `attributes` by calling `from_pretrained` on the class named by the matching
+        `<attribute>_class`.
+
+        When `<attribute>_class` is a tuple, its second entry is used if `use_fast` (read from `kwargs`, `True` when
+        absent) is truthy and that entry is not `None`; otherwise its first entry is used. All `kwargs`, `use_fast`
+        included, are forwarded unchanged to every `from_pretrained` call.
+
+        Returns:
+            `list`: The loaded objects, in the order of `attributes`.
+        """
+        args = []
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            if isinstance(class_name, tuple):
+                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                use_fast = kwargs.get("use_fast", True)
+                if use_fast and classes[1] is not None:
+                    attribute_class = classes[1]
+                else:
+                    attribute_class = classes[0]
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+
+            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return args