PoC for a ProcessorMixin class (#15549)
* PoC for a ProcessorMixin class * Documentation * Apply suggestions from code review Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Suraj Patil <surajp815@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Roll out to other processors * Add base feature extractor class in init * Use args and kwargs Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Suraj Patil <surajp815@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -12,10 +12,22 @@ specific language governing permissions and limitations under the License.
|
|||||||
|
|
||||||
# Processors
|
# Processors
|
||||||
|
|
||||||
This library includes processors for several traditional tasks. These processors can be used to process a dataset into
|
Processors can mean two different things in the Transformers library:
|
||||||
examples that can be fed to a model.
|
- the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text)
|
||||||
|
or [CLIP](../model_doc/clip) (text and vision)
|
||||||
|
- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQuAD.
|
||||||
|
|
||||||
## Processors
|
## Multi-modal processors
|
||||||
|
|
||||||
|
Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text,
|
||||||
|
vision and audio). This is handled by objects called processors, which group tokenizers (for the text modality) and
|
||||||
|
feature extractors (for vision and audio).
|
||||||
|
|
||||||
|
Those processors inherit from the following base class that implements the saving and loading functionality:
|
||||||
|
|
||||||
|
[[autodoc]] ProcessorMixin
|
||||||
|
|
||||||
|
## Deprecated processors
|
||||||
|
|
||||||
All processors follow the same architecture, which is that of the
|
All processors follow the same architecture, which is that of the
|
||||||
[`~data.processors.utils.DataProcessor`]. The processor returns a list of
|
[`~data.processors.utils.DataProcessor`]. The processor returns a list of
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ _import_structure = {
|
|||||||
"dependency_versions_table": [],
|
"dependency_versions_table": [],
|
||||||
"dynamic_module_utils": [],
|
"dynamic_module_utils": [],
|
||||||
"feature_extraction_sequence_utils": ["SequenceFeatureExtractor"],
|
"feature_extraction_sequence_utils": ["SequenceFeatureExtractor"],
|
||||||
"feature_extraction_utils": ["BatchFeature"],
|
"feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"],
|
||||||
"file_utils": [
|
"file_utils": [
|
||||||
"CONFIG_NAME",
|
"CONFIG_NAME",
|
||||||
"MODEL_CARD_NAME",
|
"MODEL_CARD_NAME",
|
||||||
@@ -365,6 +365,7 @@ _import_structure = {
|
|||||||
"ZeroShotClassificationPipeline",
|
"ZeroShotClassificationPipeline",
|
||||||
"pipeline",
|
"pipeline",
|
||||||
],
|
],
|
||||||
|
"processing_utils": ["ProcessorMixin"],
|
||||||
"testing_utils": [],
|
"testing_utils": [],
|
||||||
"tokenization_utils": ["PreTrainedTokenizer"],
|
"tokenization_utils": ["PreTrainedTokenizer"],
|
||||||
"tokenization_utils_base": [
|
"tokenization_utils_base": [
|
||||||
@@ -2307,7 +2308,7 @@ if TYPE_CHECKING:
|
|||||||
from .feature_extraction_sequence_utils import SequenceFeatureExtractor
|
from .feature_extraction_sequence_utils import SequenceFeatureExtractor
|
||||||
|
|
||||||
# Feature Extractor
|
# Feature Extractor
|
||||||
from .feature_extraction_utils import BatchFeature
|
from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
|
||||||
|
|
||||||
# Files and general utilities
|
# Files and general utilities
|
||||||
from .file_utils import (
|
from .file_utils import (
|
||||||
@@ -2555,6 +2556,7 @@ if TYPE_CHECKING:
|
|||||||
ZeroShotClassificationPipeline,
|
ZeroShotClassificationPipeline,
|
||||||
pipeline,
|
pipeline,
|
||||||
)
|
)
|
||||||
|
from .processing_utils import ProcessorMixin
|
||||||
|
|
||||||
# Tokenization
|
# Tokenization
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
|
|||||||
@@ -15,12 +15,11 @@
|
|||||||
"""
|
"""
|
||||||
Image/Text processor class for CLIP
|
Image/Text processor class for CLIP
|
||||||
"""
|
"""
|
||||||
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from .feature_extraction_clip import CLIPFeatureExtractor
|
|
||||||
from .tokenization_clip import CLIPTokenizer
|
|
||||||
|
|
||||||
|
|
||||||
class CLIPProcessor:
|
class CLIPProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
|
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
|
||||||
|
|
||||||
@@ -33,77 +32,13 @@ class CLIPProcessor:
|
|||||||
tokenizer ([`CLIPTokenizer`]):
|
tokenizer ([`CLIPTokenizer`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "CLIPFeatureExtractor"
|
||||||
|
tokenizer_class = "CLIPTokenizer"
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
if not isinstance(feature_extractor, CLIPFeatureExtractor):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type CLIPFeatureExtractor, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, CLIPTokenizer):
|
|
||||||
raise ValueError(f"`tokenizer` has to be of type CLIPTokenizer, but is {type(tokenizer)}")
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
|
|
||||||
can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling CLIPFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
|
|
||||||
CLIPTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
|
|
||||||
docstrings of the methods above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
|
|
||||||
namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
|
|||||||
@@ -18,13 +18,11 @@ Processor class for LayoutLMv2.
|
|||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...file_utils import TensorType
|
from ...file_utils import TensorType
|
||||||
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
||||||
from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
|
|
||||||
from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
|
|
||||||
from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
|
|
||||||
|
|
||||||
|
|
||||||
class LayoutLMv2Processor:
|
class LayoutLMv2Processor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
|
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
|
||||||
single processor.
|
single processor.
|
||||||
@@ -43,84 +41,8 @@ class LayoutLMv2Processor:
|
|||||||
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
|
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
|
||||||
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
|
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "LayoutLMv2FeatureExtractor"
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
|
||||||
if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
|
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__} or {LayoutLMv2TokenizerFast.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory `save_directory`,
|
|
||||||
so that it can be re-loaded using the [`~LayoutLMv2Processor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`LayoutLMv2Processor`] from a pretrained LayoutLMv2 processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling LayoutLMv2FeatureExtractor's
|
|
||||||
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutLMv2TokenizerFast's
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
|
|
||||||
use_fast (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not to instantiate a fast tokenizer.
|
|
||||||
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
if use_fast:
|
|
||||||
tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
else:
|
|
||||||
tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -17,15 +17,12 @@ Processor class for LayoutXLM.
|
|||||||
"""
|
"""
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
|
|
||||||
|
|
||||||
from ...file_utils import TensorType
|
from ...file_utils import TensorType
|
||||||
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
||||||
from .tokenization_layoutxlm import LayoutXLMTokenizer
|
|
||||||
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
|
|
||||||
|
|
||||||
|
|
||||||
class LayoutXLMProcessor:
|
class LayoutXLMProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
|
Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
|
||||||
single processor.
|
single processor.
|
||||||
@@ -44,84 +41,8 @@ class LayoutXLMProcessor:
|
|||||||
tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
|
tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
|
||||||
An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
|
An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "LayoutLMv2FeatureExtractor"
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")
|
||||||
if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
|
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {LayoutXLMTokenizer.__class__} or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory `save_directory`, so
|
|
||||||
that it can be re-loaded using the [`~LayoutXLMProcessor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`LayoutXLMProcessor`] from a pretrained LayoutXLM processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling LayoutLMv2FeatureExtractor's
|
|
||||||
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutXLMTokenizerFast's
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
|
|
||||||
use_fast (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not to instantiate a fast tokenizer.
|
|
||||||
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
if use_fast:
|
|
||||||
tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
else:
|
|
||||||
tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -17,11 +17,10 @@ Speech processor class for Speech2Text
|
|||||||
"""
|
"""
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor
|
from ...processing_utils import ProcessorMixin
|
||||||
from .tokenization_speech_to_text import Speech2TextTokenizer
|
|
||||||
|
|
||||||
|
|
||||||
class Speech2TextProcessor:
|
class Speech2TextProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a
|
Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a
|
||||||
single processor.
|
single processor.
|
||||||
@@ -36,79 +35,13 @@ class Speech2TextProcessor:
|
|||||||
tokenizer (`Speech2TextTokenizer`):
|
tokenizer (`Speech2TextTokenizer`):
|
||||||
An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
|
An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "Speech2TextFeatureExtractor"
|
||||||
|
tokenizer_class = "Speech2TextTokenizer"
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
if not isinstance(feature_extractor, Speech2TextFeatureExtractor):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {Speech2TextFeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, Speech2TextTokenizer):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {Speech2TextTokenizer.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a Speech2Text feature extractor object and Speech2Text tokenizer object to the directory `save_directory`,
|
|
||||||
so that it can be re-loaded using the [`~Speech2TextProcessor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`Speech2TextProcessor`] from a pretrained Speech2Text processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling Speech2TextFeatureExtractor's
|
|
||||||
[`~PreTrainedFeatureExtractor.from_pretrained`] and Speech2TextTokenizer's
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = Speech2TextFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
tokenizer = Speech2TextTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
|
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
|
||||||
|
|||||||
@@ -17,12 +17,10 @@ Speech processor class for Speech2Text2
|
|||||||
"""
|
"""
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
|
from ...processing_utils import ProcessorMixin
|
||||||
from ..auto.feature_extraction_auto import AutoFeatureExtractor
|
|
||||||
from .tokenization_speech_to_text_2 import Speech2Text2Tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
class Speech2Text2Processor:
|
class Speech2Text2Processor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
|
Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
|
||||||
a single processor.
|
a single processor.
|
||||||
@@ -36,77 +34,13 @@ class Speech2Text2Processor:
|
|||||||
tokenizer (`Speech2Text2Tokenizer`):
|
tokenizer (`Speech2Text2Tokenizer`):
|
||||||
An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
|
An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "AutoFeatureExtractor"
|
||||||
|
tokenizer_class = "Speech2Text2Tokenizer"
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
if not isinstance(feature_extractor, SequenceFeatureExtractor):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {SequenceFeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, Speech2Text2Tokenizer):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {Speech2Text2Tokenizer.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a Speech2Text2 feature extractor object and Speech2Text2 tokenizer object to the directory
|
|
||||||
`save_directory`, so that it can be re-loaded using the [`~Speech2Text2Processor.from_pretrained`] class
|
|
||||||
method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`Speech2Text2Processor`] from a pretrained Speech2Text2 processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
|
|
||||||
Speech2Text2Tokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
|
|
||||||
docstrings of the methods above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
tokenizer = Speech2Text2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
|
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
|
||||||
|
|||||||
@@ -17,15 +17,10 @@ Processor class for TrOCR.
|
|||||||
"""
|
"""
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
from transformers import AutoFeatureExtractor, AutoTokenizer
|
from ...processing_utils import ProcessorMixin
|
||||||
from transformers.feature_extraction_utils import FeatureExtractionMixin
|
|
||||||
from transformers.models.roberta.tokenization_roberta import RobertaTokenizer
|
|
||||||
from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
|
|
||||||
from transformers.models.xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
|
|
||||||
from transformers.models.xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast
|
|
||||||
|
|
||||||
|
|
||||||
class TrOCRProcessor:
|
class TrOCRProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
|
Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
|
||||||
|
|
||||||
@@ -39,78 +34,13 @@ class TrOCRProcessor:
|
|||||||
tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`]):
|
tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`]):
|
||||||
An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input.
|
An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "AutoFeatureExtractor"
|
||||||
|
tokenizer_class = "AutoTokenizer"
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
if not isinstance(feature_extractor, FeatureExtractionMixin):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(
|
|
||||||
tokenizer, (RobertaTokenizer, RobertaTokenizerFast, XLMRobertaTokenizer, XLMRobertaTokenizerFast)
|
|
||||||
):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {RobertaTokenizer.__class__} or {RobertaTokenizerFast.__class__} or {XLMRobertaTokenizer.__class__} or {XLMRobertaTokenizerFast.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory `save_directory`, so that it
|
|
||||||
can be re-loaded using the [`~TrOCRProcessor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
|
|
||||||
TrOCRTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
|
|
||||||
docstrings of the methods above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
|
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
|
||||||
|
|||||||
@@ -18,14 +18,12 @@ Processor class for ViLT.
|
|||||||
|
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from transformers import BertTokenizerFast
|
|
||||||
|
|
||||||
from ...file_utils import TensorType
|
from ...file_utils import TensorType
|
||||||
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
||||||
from .feature_extraction_vilt import ViltFeatureExtractor
|
|
||||||
|
|
||||||
|
|
||||||
class ViltProcessor:
|
class ViltProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
|
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
|
||||||
|
|
||||||
@@ -38,75 +36,13 @@ class ViltProcessor:
|
|||||||
tokenizer (`BertTokenizerFast`):
|
tokenizer (`BertTokenizerFast`):
|
||||||
An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
|
An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "ViltFeatureExtractor"
|
||||||
|
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
if not isinstance(feature_extractor, ViltFeatureExtractor):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {ViltFeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, BertTokenizerFast):
|
|
||||||
raise ValueError(f"`tokenizer` has to be of type {BertTokenizerFast.__class__}, but is {type(tokenizer)}")
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a ViLT feature_extractor object and BERT tokenizer object to the directory `save_directory`, so that it
|
|
||||||
can be re-loaded using the [`~ViltProcessor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`ViltProcessor`] from a pretrained ViLT processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling ViltFeatureExtractor's
|
|
||||||
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and BertTokenizerFast's
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = ViltFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
images,
|
images,
|
||||||
|
|||||||
@@ -15,17 +15,12 @@
|
|||||||
"""
|
"""
|
||||||
Processor class for VisionTextDualEncoder
|
Processor class for VisionTextDualEncoder
|
||||||
"""
|
"""
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
|
||||||
from transformers.feature_extraction_utils import FeatureExtractionMixin
|
|
||||||
|
|
||||||
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from ..auto.feature_extraction_auto import AutoFeatureExtractor
|
|
||||||
from ..auto.tokenization_auto import AutoTokenizer
|
|
||||||
|
|
||||||
|
|
||||||
class VisionTextDualEncoderProcessor:
|
class VisionTextDualEncoderProcessor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single
|
Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single
|
||||||
processor.
|
processor.
|
||||||
@@ -40,82 +35,13 @@ class VisionTextDualEncoderProcessor:
|
|||||||
tokenizer ([`PreTrainedTokenizer`]):
|
tokenizer ([`PreTrainedTokenizer`]):
|
||||||
The tokenizer is a required input.
|
The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "AutoFeatureExtractor"
|
||||||
|
tokenizer_class = "AutoTokenizer"
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
self, feature_extractor: FeatureExtractionMixin, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
|
super().__init__(feature_extractor, tokenizer)
|
||||||
):
|
|
||||||
if not isinstance(feature_extractor, FeatureExtractionMixin):
|
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type `PreTrainedTokenizer` or `PreTrainedTokenizerFast`, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a VisionTextDualEncoder feature extractor object and VisionTextDualEncoder tokenizer object to the
|
|
||||||
directory `save_directory`, so that it can be re-loaded using the
|
|
||||||
[`~VisionTextDualEncoderProcessor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r"""
|
|
||||||
Instantiate a [`VisionTextDualEncoderProcessor`] from a pretrained VisionTextDualEncoder processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
|
|
||||||
AutoTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
|
|
||||||
docstrings of the methods above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
|
||||||
|
|
||||||
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
|
|||||||
@@ -18,14 +18,12 @@ Speech processor class for Wav2Vec2
|
|||||||
import warnings
|
import warnings
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
from ...tokenization_utils import PreTrainedTokenizer
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
|
||||||
from ..auto.tokenization_auto import AutoTokenizer
|
|
||||||
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
|
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
|
||||||
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
|
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
|
||||||
|
|
||||||
|
|
||||||
class Wav2Vec2Processor:
|
class Wav2Vec2Processor(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
|
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
|
||||||
processor.
|
processor.
|
||||||
@@ -39,82 +37,17 @@ class Wav2Vec2Processor:
|
|||||||
tokenizer ([`PreTrainedTokenizer`]):
|
tokenizer ([`PreTrainedTokenizer`]):
|
||||||
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
|
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "Wav2Vec2FeatureExtractor"
|
||||||
|
tokenizer_class = "AutoTokenizer"
|
||||||
|
|
||||||
def __init__(self, feature_extractor, tokenizer):
|
def __init__(self, feature_extractor, tokenizer):
|
||||||
if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {PreTrainedTokenizer.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
"""
|
|
||||||
Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so
|
|
||||||
that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||||
r"""
|
|
||||||
Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling Wav2Vec2FeatureExtractor's
|
|
||||||
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and PreTrainedTokenizer's
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
|
|
||||||
above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path (`str` or `os.PathLike`):
|
|
||||||
This can be either:
|
|
||||||
|
|
||||||
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
|
|
||||||
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
|
|
||||||
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
||||||
- a path to a *directory* containing a feature extractor file saved using the
|
|
||||||
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
|
|
||||||
- a path or url to a saved feature extractor JSON *file*, e.g.,
|
|
||||||
`./my_model_directory/preprocessor_config.json`.
|
|
||||||
**kwargs
|
|
||||||
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
|
|
||||||
[`PreTrainedTokenizer`]
|
|
||||||
"""
|
|
||||||
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
# load generic `AutoTokenizer`
|
|
||||||
# need fallback here for backward compatibility in case processor is
|
|
||||||
# loaded from just a tokenizer file that does not have a `tokenizer_class` attribute
|
|
||||||
# behavior should be deprecated in major future release
|
|
||||||
try:
|
try:
|
||||||
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
except OSError:
|
except OSError:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
f"Loading a tokenizer inside {cls.__name__} from a config that does not"
|
f"Loading a tokenizer inside {cls.__name__} from a config that does not"
|
||||||
@@ -124,6 +57,8 @@ class Wav2Vec2Processor:
|
|||||||
"file to suppress this warning: ",
|
"file to suppress this warning: ",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
|
||||||
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||||
|
|||||||
@@ -23,16 +23,16 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from ...feature_extraction_utils import FeatureExtractionMixin
|
|
||||||
from ...file_utils import ModelOutput, requires_backends
|
from ...file_utils import ModelOutput, requires_backends
|
||||||
from ...tokenization_utils import PreTrainedTokenizer
|
from ...processing_utils import ProcessorMixin
|
||||||
from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
|
|
||||||
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pyctcdecode import BeamSearchDecoderCTC
|
from pyctcdecode import BeamSearchDecoderCTC
|
||||||
|
|
||||||
|
from ...feature_extraction_utils import FeatureExtractionMixin
|
||||||
|
from ...tokenization_utils import PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
|
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
|
||||||
@@ -47,7 +47,7 @@ class Wav2Vec2DecoderWithLMOutput(ModelOutput):
|
|||||||
text: Union[List[str], str]
|
text: Union[List[str], str]
|
||||||
|
|
||||||
|
|
||||||
class Wav2Vec2ProcessorWithLM:
|
class Wav2Vec2ProcessorWithLM(ProcessorMixin):
|
||||||
r"""
|
r"""
|
||||||
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
|
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
|
||||||
with language model support into a single processor for language model boosted speech recognition decoding.
|
with language model support into a single processor for language model boosted speech recognition decoding.
|
||||||
@@ -60,24 +60,18 @@ class Wav2Vec2ProcessorWithLM:
|
|||||||
decoder (`pyctcdecode.BeamSearchDecoderCTC`):
|
decoder (`pyctcdecode.BeamSearchDecoderCTC`):
|
||||||
An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
|
An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
|
||||||
"""
|
"""
|
||||||
|
feature_extractor_class = "Wav2Vec2FeatureExtractor"
|
||||||
|
tokenizer_class = "Wav2Vec2CTCTokenizer"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
feature_extractor: FeatureExtractionMixin,
|
feature_extractor: "FeatureExtractionMixin",
|
||||||
tokenizer: PreTrainedTokenizer,
|
tokenizer: "PreTrainedTokenizerBase",
|
||||||
decoder: "BeamSearchDecoderCTC",
|
decoder: "BeamSearchDecoderCTC",
|
||||||
):
|
):
|
||||||
from pyctcdecode import BeamSearchDecoderCTC
|
from pyctcdecode import BeamSearchDecoderCTC
|
||||||
|
|
||||||
if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
|
super().__init__(feature_extractor, tokenizer)
|
||||||
raise ValueError(
|
|
||||||
f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
|
|
||||||
)
|
|
||||||
if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
|
|
||||||
# TODO(PVP) - this can be relaxed in the future to allow other kinds of tokenizers
|
|
||||||
raise ValueError(
|
|
||||||
f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer.__class__}, but is {type(tokenizer)}"
|
|
||||||
)
|
|
||||||
if not isinstance(decoder, BeamSearchDecoderCTC):
|
if not isinstance(decoder, BeamSearchDecoderCTC):
|
||||||
raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}")
|
raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}")
|
||||||
|
|
||||||
@@ -90,37 +84,11 @@ class Wav2Vec2ProcessorWithLM:
|
|||||||
f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
|
f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.feature_extractor = feature_extractor
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.decoder = decoder
|
self.decoder = decoder
|
||||||
self.current_processor = self.feature_extractor
|
self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
"""
|
super().save_pretrained(save_directory)
|
||||||
Save the Wav2Vec2 feature_extractor, a tokenizer object and a pyctcdecode decoder to the directory
|
|
||||||
`save_directory`, so that they can be re-loaded using the [`~Wav2Vec2ProcessorWithLM.from_pretrained`] class
|
|
||||||
method.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`],
|
|
||||||
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`] and pyctcdecode's
|
|
||||||
[`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`].
|
|
||||||
|
|
||||||
Please refer to the docstrings of the methods above for more information.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_directory (`str` or `os.PathLike`):
|
|
||||||
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
|
|
||||||
be created if it does not exist).
|
|
||||||
"""
|
|
||||||
self.feature_extractor._set_processor_class(self.__class__.__name__)
|
|
||||||
self.feature_extractor.save_pretrained(save_directory)
|
|
||||||
|
|
||||||
self.tokenizer._set_processor_class(self.__class__.__name__)
|
|
||||||
self.tokenizer.save_pretrained(save_directory)
|
|
||||||
self.decoder.save_to_dir(save_directory)
|
self.decoder.save_to_dir(save_directory)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -157,8 +125,7 @@ class Wav2Vec2ProcessorWithLM:
|
|||||||
requires_backends(cls, "pyctcdecode")
|
requires_backends(cls, "pyctcdecode")
|
||||||
from pyctcdecode import BeamSearchDecoderCTC
|
from pyctcdecode import BeamSearchDecoderCTC
|
||||||
|
|
||||||
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
if os.path.isdir(pretrained_model_name_or_path):
|
if os.path.isdir(pretrained_model_name_or_path):
|
||||||
decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path)
|
decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path)
|
||||||
|
|||||||
161
src/transformers/processing_utils.py
Normal file
161
src/transformers/processing_utils.py
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2022 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Processing saving/loading class for common processors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
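# Load the top-level `transformers` package from its `__init__.py` so classes can be resolved by name via `getattr(transformers_module, ...)`.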
|
||||||
|
spec = importlib.util.spec_from_file_location(
|
||||||
|
"transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent]
|
||||||
|
)
|
||||||
|
transformers_module = spec.loader.load_module()
|
||||||
|
|
||||||
|
|
||||||
|
AUTO_TO_BASE_CLASS_MAPPING = {
|
||||||
|
"AutoTokenizer": "PreTrainedTokenizerBase",
|
||||||
|
"AutoFeatureExtractor": "FeatureExtractionMixin",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessorMixin:
    """
    This is a mixin used to provide saving/loading functionality for all processor classes.

    Subclasses describe the objects they group through class attributes:

    - `attributes` lists the names of the grouped objects, in the order expected by `__init__`;
    - for each name `attr` in `attributes`, the class attribute `attr_class` holds the name of the expected class, or
      a tuple of class names (see `_get_arguments_from_pretrained` for how a tuple is interpreted). Names are looked
      up on the `transformers` module.
    """

    attributes = ["feature_extractor", "tokenizer"]
    # Names need to be attr_class for attr in attributes
    feature_extractor_class = None
    tokenizer_class = None

    # args have to match the attributes class attribute
    def __init__(self, *args, **kwargs):
        """
        Store one object per entry of `attributes` on the processor, after validating them.

        Args:
            *args:
                Objects to group, given positionally in the same order as `attributes`.
            **kwargs:
                Objects to group, given by attribute name. Each key must be one of `attributes`.

        Raises:
            TypeError: If a keyword is not listed in `attributes`, or if an attribute is given both positionally and
                by keyword.
            ValueError: If the total number of objects received differs from `len(attributes)`, or if an object is
                not an instance of the class expected for its attribute.
        """
        # Sanitize args and kwargs
        for key in kwargs:
            if key not in self.attributes:
                raise TypeError(f"Unexpected keyword argument {key}.")

        # Count what the caller actually passed *before* merging positional arguments into `kwargs`: `zip` below
        # stops at the shortest input, so surplus positional arguments would otherwise be dropped silently.
        num_received = len(args) + len(kwargs)

        for arg, attribute_name in zip(args, self.attributes):
            if attribute_name in kwargs:
                raise TypeError(f"Got multiple values for argument {attribute_name}.")
            kwargs[attribute_name] = arg

        if num_received != len(self.attributes):
            raise ValueError(
                f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
                f"{num_received} arguments instead."
            )

        # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
        for attribute_name, arg in kwargs.items():
            class_name = getattr(self, f"{attribute_name}_class")
            # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
            class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
            if isinstance(class_name, tuple):
                # Several classes are acceptable; `None` entries mark a class that is not available and are skipped.
                proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
            else:
                proper_class = getattr(transformers_module, class_name)

            if not isinstance(arg, proper_class):
                raise ValueError(
                    f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected."
                )

            setattr(self, attribute_name, arg)

    def __repr__(self):
        """Return the processor class name followed by one `- name: repr(value)` line per attribute."""
        attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
        attributes_repr = "\n".join(attributes_repr)
        return f"{self.__class__.__name__}:\n{attributes_repr}"

    def save_pretrained(self, save_directory):
        """
        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.

        <Tip>

        This method simply calls `save_pretrained` on each of the processor's attributes, e.g.
        [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
        above for more information.

        </Tip>

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
        """
        for attribute_name in self.attributes:
            attribute = getattr(self, attribute_name)
            # Include the processor class in the attribute config so this processor can then be reloaded with the
            # `AutoProcessor` API.
            if hasattr(attribute, "_set_processor_class"):
                attribute._set_processor_class(self.__class__.__name__)
            attribute.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a processor associated with a pretrained model.

        <Tip>

        This class method is simply calling the feature extractor
        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
        methods above for more information.

        </Tip>

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                - a path to a *directory* containing a feature extractor file saved using the
                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
                - a path or url to a saved feature extractor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            **kwargs
                Additional keyword arguments passed along to both
                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].

        Returns:
            An instance of `cls` built from the loaded attributes.
        """
        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(*args)

    @classmethod
    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load every object listed in `cls.attributes` by calling `from_pretrained` on its class.

        When `attr_class` is a tuple, its first entry is used unless `use_fast` (read from `kwargs`, `True` by
        default) is truthy and the second entry is not `None`, in which case the second entry is used.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Forwarded unchanged to each `from_pretrained` call.
            **kwargs:
                Forwarded unchanged to each `from_pretrained` call.

        Returns:
            `list`: The loaded objects, in the same order as `cls.attributes`.
        """
        args = []
        for attribute_name in cls.attributes:
            class_name = getattr(cls, f"{attribute_name}_class")
            if isinstance(class_name, tuple):
                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
                use_fast = kwargs.get("use_fast", True)
                if use_fast and classes[1] is not None:
                    attribute_class = classes[1]
                else:
                    attribute_class = classes[0]
            else:
                attribute_class = getattr(transformers_module, class_name)

            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
        return args
|
||||||
Reference in New Issue
Block a user