[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* splitting all the tokenizer files - sorting sentencepiece based ones

* update tokenizer version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉

* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast  conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up tests lighten up when tokenizers + sentencepiece are both off

* style quality and tests fixing

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style quality split herbert and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix herbert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Thomas Wolf
2020-10-18 20:51:24 +02:00
committed by GitHub
parent c65863ce53
commit ba8c4d0ac0
140 changed files with 6551 additions and 3961 deletions

View File

@@ -23,20 +23,19 @@ import json
import os
import warnings
from collections import OrderedDict, UserDict
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
import numpy as np
from tokenizers import AddedToken
from tokenizers import Encoding as EncodingFast
from .file_utils import (
add_end_docstrings,
cached_path,
hf_bucket_url,
is_remote_url,
is_tf_available,
is_tokenizers_available,
is_torch_available,
torch_required,
)
@@ -45,9 +44,36 @@ from .utils import logging
if is_tf_available():
import tensorflow as tf
if is_torch_available():
import torch
if is_tokenizers_available():
from tokenizers import AddedToken
from tokenizers import Encoding as EncodingFast
else:
@dataclass(eq=True, frozen=True)
class AddedToken:
    """Fallback definition of ``AddedToken`` for when `tokenizers` is not installed.

    Describes a token to be added to a tokenizer together with the option
    flags that control how it is handled. Instances are immutable and
    compare (and hash) by field values.
    """

    # Text of the token; an empty string when not provided.
    content: str = field(default_factory=str)
    # Option flags -- NOTE(review): presumably mirror the options of
    # `tokenizers.AddedToken`; confirm against that library's documentation.
    single_word: bool = False
    lstrip: bool = False
    rstrip: bool = False
    normalized: bool = True

    def __getstate__(self):
        """Return the instance attribute dictionary as the pickling state."""
        return vars(self)
@dataclass
class EncodingFast:
    """Placeholder used when the `tokenizers` library is not installed.

    It declares no fields: without `tokenizers` there are no real encoding
    objects, so this class only keeps the name ``EncodingFast`` defined.
    """
logger = logging.get_logger(__name__)
@@ -1304,6 +1330,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = copy.deepcopy(kwargs)
self.name_or_path = kwargs.pop("name_or_path", "")
# For backward compatibility we fallback to set model_max_length from max_len if provided
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
@@ -1377,6 +1404,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
def __repr__(self) -> str:
    """Build a one-line, human-readable summary of the tokenizer.

    The summary starts with ``PreTrainedTokenizerFast`` or
    ``PreTrainedTokenizer`` depending on ``self.is_fast``, followed by the
    name/path, vocabulary size, maximum model length, fast flag, padding
    side and the extended special-tokens map.
    """
    # The label is derived from `is_fast`, not from the concrete class.
    class_label = "PreTrainedTokenizerFast" if self.is_fast else "PreTrainedTokenizer"
    return (
        f"{class_label}(name_or_path='{self.name_or_path}', "
        f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
        f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
    )
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
r"""
@@ -1562,7 +1596,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we can also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from its serialization JSON
if cls.slow_tokenizer_class is not None:
if (
"tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
) and cls.slow_tokenizer_class is not None:
slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
@@ -1618,6 +1654,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
init_kwargs["name_or_path"] = pretrained_model_name_or_path
# Instantiate tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
@@ -1669,7 +1707,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
return tokenizer
def save_pretrained(self, save_directory: str) -> Tuple[str]:
def save_pretrained(
self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None
) -> Tuple[str]:
"""
Save the full tokenizer state.
@@ -1688,7 +1728,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
modifying :obj:`tokenizer.do_lower_case` after creation).
Args:
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and
a separate added_tokens files or in the unified JSON file format for the `tokenizers` library.
It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible
with "slow" tokenizers (not powered by the `tokenizers` library).
filename_prefix (:obj:`str`, `optional`):
A prefix to add to the names of the files saved by the tokenizer.
Returns:
A tuple of :obj:`str`: The files saved.
@@ -1698,8 +1745,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
return
os.makedirs(save_directory, exist_ok=True)
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
special_tokens_map_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
)
tokenizer_config_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
)
tokenizer_config = copy.deepcopy(self.init_kwargs)
if len(self.init_inputs) > 0:
@@ -1732,19 +1783,61 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
file_names = (tokenizer_config_file, special_tokens_map_file)
return self._save_pretrained(save_directory, file_names)
return self._save_pretrained(
save_directory=save_directory,
file_names=file_names,
legacy_format=legacy_format,
filename_prefix=filename_prefix,
)
def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]:
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
def _save_pretrained(
self,
save_directory: str,
file_names: Tuple[str],
legacy_format: bool = True,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens}
using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
"""
if not legacy_format:
raise ValueError(
"Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
)
added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
)
added_vocab = self.get_added_vocab()
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, ensure_ascii=False)
f.write(out_str)
vocab_files = self.save_vocabulary(save_directory)
vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
return file_names + (vocab_files, added_tokens_file)
return file_names + vocab_files + (added_tokens_file,)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save only the vocabulary of the tokenizer (vocabulary + added tokens).

    The configuration and special token mappings of the tokenizer are not saved by this method. Use
    :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.

    This implementation is a stub that always raises :obj:`NotImplementedError`.

    Args:
        save_directory (:obj:`str`):
            The directory in which to save the vocabulary.
        filename_prefix (:obj:`str`, `optional`):
            An optional prefix to add to the names of the saved files.

    Returns:
        :obj:`Tuple(str)`: Paths to the files saved.

    Raises:
        NotImplementedError: Always, in this implementation.
    """
    raise NotImplementedError
@add_end_docstrings(
ENCODE_KWARGS_DOCSTRING,