diff --git a/docs/source/main_classes/processors.mdx b/docs/source/main_classes/processors.mdx
index 8f5f1e048d..2aaca485df 100644
--- a/docs/source/main_classes/processors.mdx
+++ b/docs/source/main_classes/processors.mdx
@@ -12,10 +12,22 @@ specific language governing permissions and limitations under the License.
# Processors
-This library includes processors for several traditional tasks. These processors can be used to process a dataset into
-examples that can be fed to a model.
+Processors can mean two different things in the Transformers library:
+- the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text)
+ or [CLIP](../model_doc/clip) (text and vision)
+- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQuAD.
-## Processors
+## Multi-modal processors
+
+Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text,
+vision and audio). This is handled by objects called processors, which group tokenizers (for the text modality) and
+feature extractors (for vision and audio).
+
+Those processors inherit from the following base class that implements the saving and loading functionality:
+
+[[autodoc]] ProcessorMixin
+
+## Deprecated processors
All processors follow the same architecture which is that of the
[`~data.processors.utils.DataProcessor`]. The processor returns a list of
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 18d7c35b65..2476d41fc8 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -95,7 +95,7 @@ _import_structure = {
"dependency_versions_table": [],
"dynamic_module_utils": [],
"feature_extraction_sequence_utils": ["SequenceFeatureExtractor"],
- "feature_extraction_utils": ["BatchFeature"],
+ "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"],
"file_utils": [
"CONFIG_NAME",
"MODEL_CARD_NAME",
@@ -365,6 +365,7 @@ _import_structure = {
"ZeroShotClassificationPipeline",
"pipeline",
],
+ "processing_utils": ["ProcessorMixin"],
"testing_utils": [],
"tokenization_utils": ["PreTrainedTokenizer"],
"tokenization_utils_base": [
@@ -2307,7 +2308,7 @@ if TYPE_CHECKING:
from .feature_extraction_sequence_utils import SequenceFeatureExtractor
# Feature Extractor
- from .feature_extraction_utils import BatchFeature
+ from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
# Files and general utilities
from .file_utils import (
@@ -2555,6 +2556,7 @@ if TYPE_CHECKING:
ZeroShotClassificationPipeline,
pipeline,
)
+ from .processing_utils import ProcessorMixin
# Tokenization
from .tokenization_utils import PreTrainedTokenizer
diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py
index ebbde87272..2323dbc7e8 100644
--- a/src/transformers/models/clip/processing_clip.py
+++ b/src/transformers/models/clip/processing_clip.py
@@ -15,12 +15,11 @@
"""
Image/Text processor class for CLIP
"""
+from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
-from .feature_extraction_clip import CLIPFeatureExtractor
-from .tokenization_clip import CLIPTokenizer
-class CLIPProcessor:
+class CLIPProcessor(ProcessorMixin):
r"""
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
@@ -33,77 +32,13 @@ class CLIPProcessor:
tokenizer ([`CLIPTokenizer`]):
The tokenizer is a required input.
"""
+ feature_extractor_class = "CLIPFeatureExtractor"
+ tokenizer_class = "CLIPTokenizer"
def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, CLIPFeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type CLIPFeatureExtractor, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, CLIPTokenizer):
- raise ValueError(f"`tokenizer` has to be of type CLIPTokenizer, but is {type(tokenizer)}")
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
- can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
-
-
-
- This class method is simply calling CLIPFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
- CLIPTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
- docstrings of the methods above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
- namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
-
- **kwargs
- Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
index b727def91c..4277d3a1b2 100644
--- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
@@ -18,13 +18,11 @@ Processor class for LayoutLMv2.
from typing import List, Optional, Union
from ...file_utils import TensorType
+from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
-from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
-from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
-class LayoutLMv2Processor:
+class LayoutLMv2Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
single processor.
@@ -43,84 +41,8 @@ class LayoutLMv2Processor:
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
"""
-
- def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)):
- raise ValueError(
- f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__} or {LayoutLMv2TokenizerFast.__class__}, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
-
- def save_pretrained(self, save_directory):
- """
- Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory `save_directory`,
- so that it can be re-loaded using the [`~LayoutLMv2Processor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
- r"""
- Instantiate a [`LayoutLMv2Processor`] from a pretrained LayoutLMv2 processor.
-
-
-
- This class method is simply calling LayoutLMv2FeatureExtractor's
- [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutLMv2TokenizerFast's
- [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
-
- use_fast (`bool`, *optional*, defaults to `True`):
- Whether or not to instantiate a fast tokenizer.
-
- **kwargs
- Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- if use_fast:
- tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
- else:
- tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+ feature_extractor_class = "LayoutLMv2FeatureExtractor"
+ tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
def __call__(
self,
diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py
index 231810bf5e..5bd1dffb5c 100644
--- a/src/transformers/models/layoutxlm/processing_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -17,15 +17,12 @@ Processor class for LayoutXLM.
"""
from typing import List, Optional, Union
-from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
-
from ...file_utils import TensorType
+from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from .tokenization_layoutxlm import LayoutXLMTokenizer
-from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
-class LayoutXLMProcessor:
+class LayoutXLMProcessor(ProcessorMixin):
r"""
Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
single processor.
@@ -44,84 +41,8 @@ class LayoutXLMProcessor:
tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
"""
-
- def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)):
- raise ValueError(
- f"`tokenizer` has to be of type {LayoutXLMTokenizer.__class__} or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
-
- def save_pretrained(self, save_directory):
- """
- Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory `save_directory`, so
- that it can be re-loaded using the [`~LayoutXLMProcessor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
- r"""
- Instantiate a [`LayoutXLMProcessor`] from a pretrained LayoutXLM processor.
-
-
-
- This class method is simply calling Layoutv2FeatureExtractor's
- [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutXLMTokenizerFast's
- [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
-
- use_fast (`bool`, *optional*, defaults to `True`):
- Whether or not to instantiate a fast tokenizer.
-
- **kwargs
- Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- if use_fast:
- tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
- else:
- tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+ feature_extractor_class = "LayoutLMv2FeatureExtractor"
+ tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")
def __call__(
self,
diff --git a/src/transformers/models/speech_to_text/processing_speech_to_text.py b/src/transformers/models/speech_to_text/processing_speech_to_text.py
index 8d983d2cc0..969df9d108 100644
--- a/src/transformers/models/speech_to_text/processing_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py
@@ -17,11 +17,10 @@ Speech processor class for Speech2Text
"""
from contextlib import contextmanager
-from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor
-from .tokenization_speech_to_text import Speech2TextTokenizer
+from ...processing_utils import ProcessorMixin
-class Speech2TextProcessor:
+class Speech2TextProcessor(ProcessorMixin):
r"""
Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a
single processor.
@@ -36,79 +35,13 @@ class Speech2TextProcessor:
tokenizer (`Speech2TextTokenizer`):
An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
"""
+ feature_extractor_class = "Speech2TextFeatureExtractor"
+ tokenizer_class = "Speech2TextTokenizer"
def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, Speech2TextFeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {Speech2TextFeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, Speech2TextTokenizer):
- raise ValueError(
- f"`tokenizer` has to be of type {Speech2TextTokenizer.__class__}, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a Speech2Text feature extractor object and Speech2Text tokenizer object to the directory `save_directory`,
- so that it can be re-loaded using the [`~Speech2TextProcessor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`Speech2TextProcessor`] from a pretrained Speech2Text processor.
-
-
-
- This class method is simply calling Speech2TextFeatureExtractor's
- [`~PreTrainedFeatureExtractor.from_pretrained`] and Speech2TextTokenizer's
- [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- **kwargs
- Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = Speech2TextFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = Speech2TextTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
index 5567ef982b..28189ba881 100644
--- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
@@ -17,12 +17,10 @@ Speech processor class for Speech2Text2
"""
from contextlib import contextmanager
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ..auto.feature_extraction_auto import AutoFeatureExtractor
-from .tokenization_speech_to_text_2 import Speech2Text2Tokenizer
+from ...processing_utils import ProcessorMixin
-class Speech2Text2Processor:
+class Speech2Text2Processor(ProcessorMixin):
r"""
Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
a single processor.
@@ -36,77 +34,13 @@ class Speech2Text2Processor:
tokenizer (`Speech2Text2Tokenizer`):
An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
"""
+ feature_extractor_class = "AutoFeatureExtractor"
+ tokenizer_class = "Speech2Text2Tokenizer"
def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, SequenceFeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {SequenceFeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, Speech2Text2Tokenizer):
- raise ValueError(
- f"`tokenizer` has to be of type {Speech2Text2Tokenizer.__class__}, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a Speech2Text2 feature extractor object and Speech2Text2 tokenizer object to the directory
- `save_directory`, so that it can be re-loaded using the [`~Speech2Text2Processor.from_pretrained`] class
- method.
-
-
-
- This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
-
- self.feature_extractor.save_pretrained(save_directory)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`Speech2Text2Processor`] from a pretrained Speech2Text2 processor.
-
-
-
- This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
- Speech2Text2Tokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
- docstrings of the methods above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- **kwargs
- Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = Speech2Text2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py
index a74dc0b38e..2c7893a091 100644
--- a/src/transformers/models/trocr/processing_trocr.py
+++ b/src/transformers/models/trocr/processing_trocr.py
@@ -17,15 +17,10 @@ Processor class for TrOCR.
"""
from contextlib import contextmanager
-from transformers import AutoFeatureExtractor, AutoTokenizer
-from transformers.feature_extraction_utils import FeatureExtractionMixin
-from transformers.models.roberta.tokenization_roberta import RobertaTokenizer
-from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
-from transformers.models.xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
-from transformers.models.xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast
+from ...processing_utils import ProcessorMixin
-class TrOCRProcessor:
+class TrOCRProcessor(ProcessorMixin):
r"""
Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
@@ -39,78 +34,13 @@ class TrOCRProcessor:
tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`]):
An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input.
"""
+ feature_extractor_class = "AutoFeatureExtractor"
+ tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, FeatureExtractionMixin):
- raise ValueError(
- f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(
- tokenizer, (RobertaTokenizer, RobertaTokenizerFast, XLMRobertaTokenizer, XLMRobertaTokenizerFast)
- ):
- raise ValueError(
- f"`tokenizer` has to be of type {RobertaTokenizer.__class__} or {RobertaTokenizerFast.__class__} or {XLMRobertaTokenizer.__class__} or {XLMRobertaTokenizerFast.__class__}, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory `save_directory`, so that it
- can be re-loaded using the [`~TrOCRProcessor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
-
- self.feature_extractor.save_pretrained(save_directory)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor.
-
-
-
- This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
- TrOCRTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
- docstrings of the methods above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- **kwargs
- Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py
index 50ca918065..602d85de68 100644
--- a/src/transformers/models/vilt/processing_vilt.py
+++ b/src/transformers/models/vilt/processing_vilt.py
@@ -18,14 +18,12 @@ Processor class for ViLT.
from typing import List, Optional, Union
-from transformers import BertTokenizerFast
-
from ...file_utils import TensorType
+from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from .feature_extraction_vilt import ViltFeatureExtractor
-class ViltProcessor:
+class ViltProcessor(ProcessorMixin):
r"""
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
@@ -38,75 +36,13 @@ class ViltProcessor:
tokenizer (`BertTokenizerFast`):
An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
"""
+ feature_extractor_class = "ViltFeatureExtractor"
+ tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, ViltFeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {ViltFeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, BertTokenizerFast):
- raise ValueError(f"`tokenizer` has to be of type {BertTokenizerFast.__class__}, but is {type(tokenizer)}")
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a ViLT feature_extractor object and BERT tokenizer object to the directory `save_directory`, so that it
- can be re-loaded using the [`~ViltProcessor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
-
- self.feature_extractor.save_pretrained(save_directory)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`ViltProcessor`] from a pretrained ViLT processor.
-
-
-
- This class method is simply calling ViltFeatureExtractor's
- [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and BertTokenizerFast's
- [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- **kwargs
- Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = ViltFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
def __call__(
self,
images,
diff --git a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
index 4406f80d3d..6cc58b2627 100644
--- a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
@@ -15,17 +15,12 @@
"""
Processor class for VisionTextDualEncoder
"""
-from typing import Union
-
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
-from transformers.feature_extraction_utils import FeatureExtractionMixin
+from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
-from ..auto.feature_extraction_auto import AutoFeatureExtractor
-from ..auto.tokenization_auto import AutoTokenizer
-class VisionTextDualEncoderProcessor:
+class VisionTextDualEncoderProcessor(ProcessorMixin):
r"""
Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single
processor.
@@ -40,82 +35,13 @@ class VisionTextDualEncoderProcessor:
tokenizer ([`PreTrainedTokenizer`]):
The tokenizer is a required input.
"""
+ feature_extractor_class = "AutoFeatureExtractor"
+ tokenizer_class = "AutoTokenizer"
- def __init__(
- self, feature_extractor: FeatureExtractionMixin, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
- ):
- if not isinstance(feature_extractor, FeatureExtractionMixin):
- raise ValueError(
- f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
- raise ValueError(
- f"`tokenizer` has to be of type `PreTrainedTokenizer` or `PreTrainedTokenizerFast`, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ def __init__(self, feature_extractor, tokenizer):
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a VisionTextDualEncoder feature extractor object and VisionTextDualEncoder tokenizer object to the
- directory `save_directory`, so that it can be re-loaded using the
- [`~VisionTextDualEncoderProcessor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`VisionTextDualEncoderProcessor`] from a pretrained VisionTextDualEncoder processor.
-
-
-
- This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
- AutoTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
- docstrings of the methods above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
-
- **kwargs
- Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py
index 7679272142..1470c254dc 100644
--- a/src/transformers/models/wav2vec2/processing_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py
@@ -18,14 +18,12 @@ Speech processor class for Wav2Vec2
import warnings
from contextlib import contextmanager
-from ...tokenization_utils import PreTrainedTokenizer
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ..auto.tokenization_auto import AutoTokenizer
+from ...processing_utils import ProcessorMixin
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
-class Wav2Vec2Processor:
+class Wav2Vec2Processor(ProcessorMixin):
r"""
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
processor.
@@ -39,82 +37,17 @@ class Wav2Vec2Processor:
tokenizer ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
"""
+ feature_extractor_class = "Wav2Vec2FeatureExtractor"
+ tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
- if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
- raise ValueError(
- f"`tokenizer` has to be of type {PreTrainedTokenizer.__class__}, but is {type(tokenizer)}"
- )
-
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
+ super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
- def save_pretrained(self, save_directory):
- """
- Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so
- that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method.
-
-
-
- This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
-
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
- r"""
- Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor.
-
-
-
- This class method is simply calling Wav2Vec2FeatureExtractor's
- [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and PreTrainedTokenizer's
- [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
- above for more information.
-
-
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
- namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- **kwargs
- Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
- [`PreTrainedTokenizer`]
- """
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
- # load generic `AutoTokenizer`
- # need fallback here for backward compatibility in case processor is
- # loaded from just a tokenizer file that does not have a `tokenizer_class` attribute
- # behavior should be deprecated in major future release
try:
- tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+ return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
except OSError:
warnings.warn(
f"Loading a tokenizer inside {cls.__name__} from a config that does not"
@@ -124,9 +57,11 @@ class Wav2Vec2Processor:
"file to suppress this warning: ",
FutureWarning,
)
+
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+ return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
def __call__(self, *args, **kwargs):
"""
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 148e42ec66..c31b209c18 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -23,16 +23,16 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import numpy as np
-from ...feature_extraction_utils import FeatureExtractionMixin
from ...file_utils import ModelOutput, requires_backends
-from ...tokenization_utils import PreTrainedTokenizer
-from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
-from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
+from ...processing_utils import ProcessorMixin
if TYPE_CHECKING:
from pyctcdecode import BeamSearchDecoderCTC
+ from ...feature_extraction_utils import FeatureExtractionMixin
+ from ...tokenization_utils import PreTrainedTokenizerBase
+
@dataclass
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
@@ -47,7 +47,7 @@ class Wav2Vec2DecoderWithLMOutput(ModelOutput):
text: Union[List[str], str]
-class Wav2Vec2ProcessorWithLM:
+class Wav2Vec2ProcessorWithLM(ProcessorMixin):
r"""
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
with language model support into a single processor for language model boosted speech recognition decoding.
@@ -60,24 +60,18 @@ class Wav2Vec2ProcessorWithLM:
decoder (`pyctcdecode.BeamSearchDecoderCTC`):
An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
"""
+ feature_extractor_class = "Wav2Vec2FeatureExtractor"
+ tokenizer_class = "Wav2Vec2CTCTokenizer"
def __init__(
self,
- feature_extractor: FeatureExtractionMixin,
- tokenizer: PreTrainedTokenizer,
+ feature_extractor: "FeatureExtractionMixin",
+ tokenizer: "PreTrainedTokenizerBase",
decoder: "BeamSearchDecoderCTC",
):
from pyctcdecode import BeamSearchDecoderCTC
- if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
- raise ValueError(
- f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
- )
- if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
- # TODO(PVP) - this can be relaxed in the future to allow other kinds of tokenizers
- raise ValueError(
- f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer.__class__}, but is {type(tokenizer)}"
- )
+ super().__init__(feature_extractor, tokenizer)
if not isinstance(decoder, BeamSearchDecoderCTC):
raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}")
@@ -90,37 +84,11 @@ class Wav2Vec2ProcessorWithLM:
f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
)
- self.feature_extractor = feature_extractor
- self.tokenizer = tokenizer
self.decoder = decoder
self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
- """
- Save the Wav2Vec2 feature_extractor, a tokenizer object and a pyctcdecode decoder to the directory
- `save_directory`, so that they can be re-loaded using the [`~Wav2Vec2ProcessorWithLM.from_pretrained`] class
- method.
-
-
-
- This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained,`]
- [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`] and pyctcdecode's
- [`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`].
-
- Please refer to the docstrings of the methods above for more information.
-
-
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
- be created if it does not exist).
- """
- self.feature_extractor._set_processor_class(self.__class__.__name__)
- self.feature_extractor.save_pretrained(save_directory)
-
- self.tokenizer._set_processor_class(self.__class__.__name__)
- self.tokenizer.save_pretrained(save_directory)
+ super().save_pretrained(save_directory)
self.decoder.save_to_dir(save_directory)
@classmethod
@@ -157,8 +125,7 @@ class Wav2Vec2ProcessorWithLM:
requires_backends(cls, "pyctcdecode")
from pyctcdecode import BeamSearchDecoderCTC
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
- tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+ feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
if os.path.isdir(pretrained_model_name_or_path):
decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path)
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
new file mode 100644
index 0000000000..ec6196c862
--- /dev/null
+++ b/src/transformers/processing_utils.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Processing saving/loading class for common processors.
+"""
+
+import importlib.util
+from pathlib import Path
+
+
+# Load the `transformers` package from the `__init__.py` located next to this file, so that the classes named in
+# the `*_class` attributes of processors can be looked up on it with `getattr`.
+# NOTE(review): `Loader.load_module()` is marked deprecated in the `importlib` documentation - confirm it is
+# still available on every supported Python version.
+spec = importlib.util.spec_from_file_location(
+ "transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent]
+)
+transformers_module = spec.loader.load_module()
+
+
+# Maps the name of an `Auto*` class to the name of the class that is used in its place for the `isinstance`
+# checks in `ProcessorMixin.__init__`.
+AUTO_TO_BASE_CLASS_MAPPING = {
+ "AutoTokenizer": "PreTrainedTokenizerBase",
+ "AutoFeatureExtractor": "FeatureExtractionMixin",
+}
+
+
+class ProcessorMixin:
+    """
+    This is a mixin used to provide saving/loading functionality for all processor classes.
+
+    Subclasses list the objects they wrap in `attributes` and, for each name `attr` in that list, define a class
+    attribute `attr_class` holding the name of the expected class as exposed by the `transformers` package. That
+    value is either a single class name or a tuple of class names; when loading, the second entry of a tuple is
+    used unless `use_fast=False` is passed or that entry is `None`.
+    """
+
+    attributes = ["feature_extractor", "tokenizer"]
+    # Names need to be attr_class for attr in attributes
+    feature_extractor_class = None
+    tokenizer_class = None
+
+    # args have to match the attributes class attribute
+    def __init__(self, *args, **kwargs):
+        """
+        Validate the wrapped objects and store each of them as an instance attribute.
+
+        Args:
+            *args:
+                Wrapped objects, given positionally in the order of `attributes`.
+            **kwargs:
+                Wrapped objects, given by attribute name.
+
+        Raises:
+            TypeError: If a keyword is not listed in `attributes`, or if an attribute is given both positionally
+                and by keyword.
+            ValueError: If the number of objects received differs from the number of attributes, or if an object
+                is not an instance of the class expected for its attribute.
+        """
+        # Count before positional arguments are merged into kwargs, so the error below reports what was passed.
+        num_received = len(args) + len(kwargs)
+
+        # Sanitize args and kwargs
+        for key in kwargs:
+            if key not in self.attributes:
+                raise TypeError(f"Unexpected keyword argument {key}.")
+        for arg, attribute_name in zip(args, self.attributes):
+            if attribute_name in kwargs:
+                raise TypeError(f"Got multiple values for argument {attribute_name}.")
+            kwargs[attribute_name] = arg
+
+        # `zip` silently drops surplus positional arguments, so they have to be rejected explicitly.
+        if len(args) > len(self.attributes) or len(kwargs) != len(self.attributes):
+            raise ValueError(
+                f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
+                f"{num_received} arguments instead."
+            )
+
+        # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
+        for attribute_name, arg in kwargs.items():
+            class_name = getattr(self, f"{attribute_name}_class")
+            # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
+            class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
+            if isinstance(class_name, tuple):
+                # `None` entries mark a class that is not available and are left out of the check.
+                expected_names = [n for n in class_name if n is not None]
+                proper_class = tuple(getattr(transformers_module, n) for n in expected_names)
+                expected = " or ".join(expected_names)
+            else:
+                proper_class = getattr(transformers_module, class_name)
+                expected = class_name
+
+            if not isinstance(arg, proper_class):
+                raise ValueError(
+                    f"Received a {type(arg).__name__} for argument {attribute_name}, but a {expected} was expected."
+                )
+
+            setattr(self, attribute_name, arg)
+
+    def __repr__(self):
+        """Return the class name followed by one `- name: repr` line per wrapped attribute."""
+        attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
+        attributes_repr = "\n".join(attributes_repr)
+        return f"{self.__class__.__name__}:\n{attributes_repr}"
+
+    def save_pretrained(self, save_directory):
+        """
+        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
+        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
+
+        This method simply calls `save_pretrained` on every attribute listed in `attributes`, e.g.
+        [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
+        above for more information.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
+                be created if it does not exist).
+        """
+        for attribute_name in self.attributes:
+            attribute = getattr(self, attribute_name)
+            # Include the processor class in the attribute config so this processor can then be reloaded with the
+            # `AutoProcessor` API.
+            if hasattr(attribute, "_set_processor_class"):
+                attribute._set_processor_class(self.__class__.__name__)
+            attribute.save_pretrained(save_directory)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r"""
+        Instantiate a processor associated with a pretrained model.
+
+        This class method is simply calling the feature extractor
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
+        methods above for more information.
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            **kwargs
+                Additional keyword arguments passed along to both
+                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
+
+        Returns:
+            An instance of `cls` built from the loaded attributes, passed positionally in the order of `attributes`.
+        """
+        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return cls(*args)
+
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """
+        Load every attribute of the processor with the `from_pretrained` method of its class.
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                Forwarded unchanged to the `from_pretrained` method of each attribute class.
+            **kwargs
+                Forwarded unchanged to each `from_pretrained` call; `use_fast` is additionally read here to pick a
+                class when an `attr_class` value is a tuple.
+
+        Returns:
+            `list`: The loaded objects, in the order of `attributes`.
+        """
+        args = []
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            if isinstance(class_name, tuple):
+                # Two-entry tuple: the second class is used when `use_fast` is true (the default) and that entry
+                # is not None, the first class otherwise.
+                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                use_fast = kwargs.get("use_fast", True)
+                if use_fast and classes[1] is not None:
+                    attribute_class = classes[1]
+                else:
+                    attribute_class = classes[0]
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+
+            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return args