[AutoProcessor] Correct AutoProcessor and automatically add processor… (#14881)

* [AutoProcessor] Correct AutoProcessor and automatically add processor class * up * up * up * up * up * up * up * up * continue tomorrow * up * up * up * make processor class private * fix loop
2021-12-30 09:56:43 +01:00
parent d7d60df0ec
commit a1392883ce
11 changed files with 110 additions and 7 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1444,6 +1444,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        self.init_inputs = ()
        self.init_kwargs = copy.deepcopy(kwargs)
        self.name_or_path = kwargs.pop("name_or_path", "")
+        self._processor_class = kwargs.pop("processor_class", None)

        # For backward compatibility we fallback to set model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
@@ -1505,6 +1506,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
            )

+    def _set_processor_class(self, processor_class: str):
+        """Sets processor class as an attribute."""
+        self._processor_class = processor_class
+
    def __repr__(self) -> str:
        return (
            f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
@@ -2029,6 +2034,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        tokenizer_config["tokenizer_class"] = tokenizer_class
        if getattr(self, "_auto_map", None) is not None:
            tokenizer_config["auto_map"] = self._auto_map
+        if getattr(self, "_processor_class", None) is not None:
+            tokenizer_config["processor_class"] = self._processor_class

        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(tokenizer_config, ensure_ascii=False))