[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)
* splitting fast and slow tokenizers [WIP] * [WIP] splitting sentencepiece and tokenizers dependencies * update dummy objects * add name_or_path to models and tokenizers * prefix added to file names * prefix * styling + quality * spliting all the tokenizer files - sorting sentencepiece based ones * update tokenizer version up to 0.9.0 * remove hard dependency on sentencepiece 🎉 * and removed hard dependency on tokenizers 🎉 * update conversion script * update missing models * fixing tests * move test_tokenization_fast to main tokenization tests - fix bugs * bump up tokenizers * fix bert_generation * update ad fix several tokenizers * keep sentencepiece in deps for now * fix funnel and deberta tests * fix fsmt * fix marian tests * fix layoutlm * fix squeezebert and gpt2 * fix T5 tokenization * fix xlnet tests * style * fix mbart * bump up tokenizers to 0.9.2 * fix model tests * fix tf models * fix seq2seq examples * fix tests without sentencepiece * fix slow => fast conversion without sentencepiece * update auto and bert generation tests * fix mbart tests * fix auto and common test without tokenizers * fix tests without tokenizers * clean up tests lighten up when tokenizers + sentencepiece are both off * style quality and tests fixing * add sentencepiece to doc/examples reqs * leave sentencepiece on for now * style quality split hebert and fix pegasus * WIP Herbert fast * add sample_text_no_unicode and fix hebert tokenization * skip FSMT example test for now * fix style * fix fsmt in example tests * update following Lysandre and Sylvain's comments * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py 
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -23,20 +23,19 @@ import json
|
||||
import os
|
||||
import warnings
|
||||
from collections import OrderedDict, UserDict
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from tokenizers import AddedToken
|
||||
from tokenizers import Encoding as EncodingFast
|
||||
|
||||
from .file_utils import (
|
||||
add_end_docstrings,
|
||||
cached_path,
|
||||
hf_bucket_url,
|
||||
is_remote_url,
|
||||
is_tf_available,
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
torch_required,
|
||||
)
|
||||
@@ -45,9 +44,36 @@ from .utils import logging
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_tokenizers_available():
|
||||
from tokenizers import AddedToken
|
||||
from tokenizers import Encoding as EncodingFast
|
||||
else:
|
||||
|
||||
@dataclass(frozen=True, eq=True)
class AddedToken:
    """Pure-Python stand-in for ``AddedToken``: a token to be added to a Tokenizer.

    The fields below are the special options attached to the token. Instances are
    frozen (attributes cannot be reassigned after construction) and compare by
    field values.
    """

    # Text of the token; an empty string when no content is supplied.
    content: str = field(default_factory=str)

    # Option flags. How each one affects matching is decided by the code that
    # consumes the token, which is not visible here.
    single_word: bool = False
    lstrip: bool = False
    rstrip: bool = False
    normalized: bool = True

    def __getstate__(self):
        """Return the instance attribute dictionary as the pickling state."""
        return vars(self)
|
||||
|
||||
@dataclass
class EncodingFast:
    """Dummy placeholder class: without the `tokenizers` library these objects do not exist anyway."""
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
@@ -1304,6 +1330,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
|
||||
self.init_inputs = ()
|
||||
self.init_kwargs = copy.deepcopy(kwargs)
|
||||
self.name_or_path = kwargs.pop("name_or_path", "")
|
||||
|
||||
# For backward compatibility we fallback to set model_max_length from max_len if provided
|
||||
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
|
||||
@@ -1377,6 +1404,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
    """Return a one-line, human-readable summary of the tokenizer.

    The summary starts with ``PreTrainedTokenizerFast`` or ``PreTrainedTokenizer``
    depending on ``self.is_fast``, followed by the name/path, vocabulary size,
    maximum model length, fast flag, padding side and extended special-tokens map.
    """
    fast = self.is_fast
    label = "PreTrainedTokenizerFast" if fast else "PreTrainedTokenizer"
    details = ", ".join(
        (
            f"name_or_path='{self.name_or_path}'",
            f"vocab_size={self.vocab_size}",
            f"model_max_len={self.model_max_length}",
            f"is_fast={fast}",
            f"padding_side='{self.padding_side}'",
            f"special_tokens={self.special_tokens_map_extended}",
        )
    )
    return f"{label}({details})"
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
|
||||
r"""
|
||||
@@ -1562,7 +1596,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
# We instantiate fast tokenizers based on a slow tokenizer for now
|
||||
# In the future we can also use a direct way based on saving/instantiating
|
||||
# tokenizer's Tokenizer directly from its serialization JSON
|
||||
if cls.slow_tokenizer_class is not None:
|
||||
if (
|
||||
"tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
|
||||
) and cls.slow_tokenizer_class is not None:
|
||||
slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
|
||||
copy.deepcopy(resolved_vocab_files),
|
||||
pretrained_model_name_or_path,
|
||||
@@ -1618,6 +1654,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
if slow_tokenizer is not None:
|
||||
init_kwargs["__slow_tokenizer"] = slow_tokenizer
|
||||
|
||||
init_kwargs["name_or_path"] = pretrained_model_name_or_path
|
||||
|
||||
# Instantiate tokenizer.
|
||||
try:
|
||||
tokenizer = cls(*init_inputs, **init_kwargs)
|
||||
@@ -1669,7 +1707,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
return tokenizer
|
||||
|
||||
def save_pretrained(self, save_directory: str) -> Tuple[str]:
|
||||
def save_pretrained(
|
||||
self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None
|
||||
) -> Tuple[str]:
|
||||
"""
|
||||
Save the full tokenizer state.
|
||||
|
||||
@@ -1688,7 +1728,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
modifying :obj:`tokenizer.do_lower_case` after creation).
|
||||
|
||||
Args:
|
||||
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
|
||||
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
|
||||
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and
|
||||
a separate added_tokens files or in the unified JSON file format for the `tokenizers` library.
|
||||
It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible
|
||||
with "slow" tokenizers (not powered by the `tokenizers` library).
|
||||
filename_prefix (:obj:`str`, `optional`):
|
||||
A prefix to add to the names of the files saved by the tokenizer.
|
||||
|
||||
Returns:
|
||||
A tuple of :obj:`str`: The files saved.
|
||||
@@ -1698,8 +1745,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
return
|
||||
os.makedirs(save_directory, exist_ok=True)
|
||||
|
||||
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
|
||||
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
|
||||
special_tokens_map_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
|
||||
)
|
||||
tokenizer_config_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
|
||||
)
|
||||
|
||||
tokenizer_config = copy.deepcopy(self.init_kwargs)
|
||||
if len(self.init_inputs) > 0:
|
||||
@@ -1732,19 +1783,61 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
file_names = (tokenizer_config_file, special_tokens_map_file)
|
||||
|
||||
return self._save_pretrained(save_directory, file_names)
|
||||
return self._save_pretrained(
|
||||
save_directory=save_directory,
|
||||
file_names=file_names,
|
||||
legacy_format=legacy_format,
|
||||
filename_prefix=filename_prefix,
|
||||
)
|
||||
|
||||
def _save_pretrained(
    self,
    save_directory: str,
    file_names: Tuple[str],
    legacy_format: bool = True,
    filename_prefix: Optional[str] = None,
) -> Tuple[str]:
    """Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

    Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens}
    using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`

    Args:
        save_directory (:obj:`str`):
            Directory in which the files are written.
        file_names (:obj:`Tuple[str]`):
            Paths already collected by the caller; they form the start of the returned tuple.
        legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Must be :obj:`True` here; this implementation only knows the legacy format.
        filename_prefix (:obj:`str`, `optional`):
            When set, ``"<prefix>-"`` is prepended to the added-tokens file name and the prefix is
            forwarded to :meth:`save_vocabulary`.

    Returns:
        :obj:`Tuple[str]`: ``file_names``, then the paths returned by :meth:`save_vocabulary`, then the
        added-tokens file path, as one flat tuple. The added-tokens path is included even when there
        are no added tokens and the file is therefore not written.

    Raises:
        ValueError: If ``legacy_format`` is :obj:`False`.
    """
    if not legacy_format:
        raise ValueError(
            "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
        )

    prefix = filename_prefix + "-" if filename_prefix else ""
    added_tokens_file = os.path.join(save_directory, prefix + ADDED_TOKENS_FILE)

    # Only write the added-tokens file when there is something to put in it.
    added_vocab = self.get_added_vocab()
    if added_vocab:
        with open(added_tokens_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(added_vocab, ensure_ascii=False))

    # Save the vocabulary exactly once, honoring the prefix.
    vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

    # ``vocab_files`` is itself a tuple of paths: concatenate it so the result stays flat.
    return file_names + vocab_files + (added_tokens_file,)
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save only the vocabulary of the tokenizer (vocabulary + added tokens).

    The configuration and the special token mappings of the tokenizer are not saved by this method.
    Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of
    the tokenizer.

    This base implementation always raises :obj:`NotImplementedError`.

    Args:
        save_directory (:obj:`str`):
            The directory in which to save the vocabulary.
        filename_prefix (:obj:`str`, `optional`):
            An optional prefix to add to the names of the saved files.

    Returns:
        :obj:`Tuple(str)`: Paths to the files saved.
    """
    raise NotImplementedError
|
||||
|
||||
@add_end_docstrings(
|
||||
ENCODE_KWARGS_DOCSTRING,
|
||||
|
||||
Reference in New Issue
Block a user