Honor trust_remote_code for custom tokenizers (#28854)
* pass through trust_remote_code for dynamically loading unregistered tokenizers specified by config
  add test
* change directories back to previous directory after test
* fix ruff check
* Add a note to that block for future in case we want to remove it later

---------

Co-authored-by: Matt <rocketknight1@gmail.com>
This commit is contained in:
@@ -800,7 +800,9 @@ class AutoTokenizer:
|
|||||||
_ = kwargs.pop("code_revision", None)
|
_ = kwargs.pop("code_revision", None)
|
||||||
if os.path.isdir(pretrained_model_name_or_path):
|
if os.path.isdir(pretrained_model_name_or_path):
|
||||||
tokenizer_class.register_for_auto_class()
|
tokenizer_class.register_for_auto_class()
|
||||||
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
return tokenizer_class.from_pretrained(
|
||||||
|
pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
|
||||||
|
)
|
||||||
elif config_tokenizer_class is not None:
|
elif config_tokenizer_class is not None:
|
||||||
tokenizer_class = None
|
tokenizer_class = None
|
||||||
if use_fast and not config_tokenizer_class.endswith("Fast"):
|
if use_fast and not config_tokenizer_class.endswith("Fast"):
|
||||||
|
|||||||
@@ -1810,6 +1810,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
local_files_only: bool = False,
|
local_files_only: bool = False,
|
||||||
token: Optional[Union[str, bool]] = None,
|
token: Optional[Union[str, bool]] = None,
|
||||||
revision: str = "main",
|
revision: str = "main",
|
||||||
|
trust_remote_code=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
r"""
|
r"""
|
||||||
@@ -1853,6 +1854,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
facebook/rag-token-base), specify it here.
|
facebook/rag-token-base), specify it here.
|
||||||
inputs (additional positional arguments, *optional*):
|
inputs (additional positional arguments, *optional*):
|
||||||
Will be passed along to the Tokenizer `__init__` method.
|
Will be passed along to the Tokenizer `__init__` method.
|
||||||
|
trust_remote_code (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
|
||||||
|
should only be set to `True` for repositories you trust and in which you have read the code, as it will
|
||||||
|
execute code present on the Hub on your local machine.
|
||||||
kwargs (additional keyword arguments, *optional*):
|
kwargs (additional keyword arguments, *optional*):
|
||||||
Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
|
Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
|
||||||
`eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
|
`eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
|
||||||
@@ -2036,6 +2041,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
local_files_only=local_files_only,
|
local_files_only=local_files_only,
|
||||||
_commit_hash=commit_hash,
|
_commit_hash=commit_hash,
|
||||||
_is_local=is_local,
|
_is_local=is_local,
|
||||||
|
trust_remote_code=trust_remote_code,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -2051,6 +2057,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
local_files_only=False,
|
local_files_only=False,
|
||||||
_commit_hash=None,
|
_commit_hash=None,
|
||||||
_is_local=False,
|
_is_local=False,
|
||||||
|
trust_remote_code=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
|
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
|
||||||
@@ -2099,6 +2106,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if config_tokenizer_class is None:
|
if config_tokenizer_class is None:
|
||||||
|
# Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo.
|
||||||
|
# If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with
|
||||||
|
# AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain.
|
||||||
|
# Maybe we can just remove this entirely?
|
||||||
from .models.auto.configuration_auto import AutoConfig # tests_ignore
|
from .models.auto.configuration_auto import AutoConfig # tests_ignore
|
||||||
|
|
||||||
# Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
|
# Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
|
||||||
@@ -2108,6 +2119,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
token=token,
|
token=token,
|
||||||
cache_dir=cache_dir,
|
cache_dir=cache_dir,
|
||||||
local_files_only=local_files_only,
|
local_files_only=local_files_only,
|
||||||
|
trust_remote_code=trust_remote_code,
|
||||||
_commit_hash=_commit_hash,
|
_commit_hash=_commit_hash,
|
||||||
)
|
)
|
||||||
config_tokenizer_class = config.tokenizer_class
|
config_tokenizer_class = config.tokenizer_class
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
@@ -429,3 +430,73 @@ class AutoTokenizerTest(unittest.TestCase):
|
|||||||
self.assertEqual(counter["GET"], 0)
|
self.assertEqual(counter["GET"], 0)
|
||||||
self.assertEqual(counter["HEAD"], 1)
|
self.assertEqual(counter["HEAD"], 1)
|
||||||
self.assertEqual(counter.total_calls, 1)
|
self.assertEqual(counter.total_calls, 1)
|
||||||
|
|
||||||
|
def test_init_tokenizer_with_trust(self):
    """Check that `AutoTokenizer.from_pretrained` honors `trust_remote_code` for a custom tokenizer.

    A fake repository is written into a temporary directory. It contains a custom
    tokenizer module, a custom config module, a `config.json` and a
    `tokenizer_config.json` whose `auto_map` entries point at those modules.
    Loading with `trust_remote_code=True` must succeed; loading with
    `trust_remote_code=False` must raise `ValueError`.
    """
    # Source of the custom (unregistered) tokenizer class referenced by tokenizer_config.json.
    nop_tokenizer_code = """
import transformers

class NopTokenizer(transformers.PreTrainedTokenizer):
    def get_vocab(self):
        return {}
"""

    # Source of the custom config class referenced by config.json.
    nop_config_code = """
from transformers import PretrainedConfig

class NopConfig(PretrainedConfig):
    model_type = "test_unregistered_dynamic"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
"""

    with tempfile.TemporaryDirectory() as tmp_dir:
        fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
        fake_repo = os.path.join(tmp_dir, fake_model_id)
        os.makedirs(fake_repo)

        tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
        with open(tokenizer_src_file, "w", encoding="utf-8") as wfp:
            wfp.write(nop_tokenizer_code)

        model_config_src_file = os.path.join(fake_repo, "config.py")
        with open(model_config_src_file, "w", encoding="utf-8") as wfp:
            wfp.write(nop_config_code)

        config = {
            "model_type": "test_unregistered_dynamic",
            "auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
        }

        config_file = os.path.join(fake_repo, "config.json")
        with open(config_file, "w", encoding="utf-8") as wfp:
            json.dump(config, wfp, indent=2)

        tokenizer_config = {
            "auto_map": {
                "AutoTokenizer": [
                    f"{fake_model_id}--tokenizer.NopTokenizer",
                    None,
                ]
            }
        }

        tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as wfp:
            json.dump(tokenizer_config, wfp, indent=2)

        prev_dir = os.getcwd()
        try:
            # it looks like subdir= is broken in the from_pretrained also, so this is necessary:
            # the relative model id is resolved against the current working directory.
            os.chdir(tmp_dir)

            # this should work because we trust the code
            AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)

            # this should fail because we don't trust and we're not at a terminal for interactive response
            with self.assertRaises(ValueError):
                AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False)
        finally:
            # Always restore the working directory so later tests are unaffected and
            # the temporary directory can be removed.
            os.chdir(prev_dir)
|
||||||
|
|||||||
Reference in New Issue
Block a user