[AutoProcessor] Correct AutoProcessor and automatically add processor… (#14881)

* [AutoProcessor] Correct AutoProcessor and automatically add processor class

* up

* up

* up

* up

* up

* up

* up

* up

* continue tomorrow

* up

* up

* up

* make processor class private

* fix loop
This commit is contained in:
Patrick von Platen
2021-12-30 09:56:43 +01:00
committed by GitHub
parent d7d60df0ec
commit a1392883ce
11 changed files with 110 additions and 7 deletions

View File

@@ -1444,6 +1444,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
self.init_inputs = ()
self.init_kwargs = copy.deepcopy(kwargs)
self.name_or_path = kwargs.pop("name_or_path", "")
self._processor_class = kwargs.pop("processor_class", None)
# For backward compatibility we fallback to set model_max_length from max_len if provided
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
@@ -1505,6 +1506,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
self._processor_class = processor_class
def __repr__(self) -> str:
return (
f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
@@ -2029,6 +2034,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
tokenizer_config["tokenizer_class"] = tokenizer_class
if getattr(self, "_auto_map", None) is not None:
tokenizer_config["auto_map"] = self._auto_map
if getattr(self, "_processor_class", None) is not None:
tokenizer_config["processor_class"] = self._processor_class
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))