Use code on the Hub from another repo (#22698)

* Initial work

* Add other classes

* Refactor code

* Move warning and fix dynamic pipeline

* Issue warning when necessary

* Add test
This commit is contained in:
Sylvain Gugger
2023-04-17 11:36:29 -04:00
committed by GitHub
parent 4d2c52e830
commit ea7b0a539a
12 changed files with 98 additions and 66 deletions

View File

@@ -1817,6 +1817,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
cache_dir=cache_dir,
local_files_only=local_files_only,
_commit_hash=commit_hash,
_is_local=is_local,
**kwargs,
)
@@ -1831,6 +1832,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
cache_dir=None,
local_files_only=False,
_commit_hash=None,
_is_local=False,
**kwargs,
):
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
@@ -1861,7 +1863,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
# First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
config_tokenizer_class = init_kwargs.get("tokenizer_class")
init_kwargs.pop("tokenizer_class", None)
init_kwargs.pop("auto_map", None)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
@@ -1869,6 +1870,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
config_tokenizer_class = None
init_kwargs = init_configuration
if "auto_map" in init_kwargs and not _is_local:
new_auto_map = {}
for key, value in init_kwargs["auto_map"].items():
if isinstance(value, (list, tuple)):
new_auto_map[key] = [f"{pretrained_model_name_or_path}--{v}" for v in value]
else:
new_auto_map[key] = f"{pretrained_model_name_or_path}--{value}"
init_kwargs["auto_map"] = new_auto_map
if config_tokenizer_class is None:
from .models.auto.configuration_auto import AutoConfig # tests_ignore