Fix for fast tokenizers save_pretrained compatibility with Python. (#2933)
* Renamed file generated by tokenizers when calling save_pretrained to match Python.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Added save_vocabulary tests.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Remove Python quick and dirty fix for clean Rust impl.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Bump tokenizers dependency to 0.5.1
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* TransfoXLTokenizerFast uses a json vocabulary file + warning about incompatibility between Python and Rust
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Added some save_pretrained / from_pretrained unittests.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Update tokenizers to 0.5.2
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Quality and format.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* flake8
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Making sure there is really a bug in unittest
* Fix TransfoXL constructor vocab_file / pretrained_vocab_file mixin.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
This commit is contained in:
2
setup.py
2
setup.py
@@ -92,7 +92,7 @@ setup(
|
|||||||
packages=find_packages("src"),
|
packages=find_packages("src"),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"numpy",
|
"numpy",
|
||||||
"tokenizers == 0.5.0",
|
"tokenizers == 0.5.2",
|
||||||
# accessing files from S3 directly
|
# accessing files from S3 directly
|
||||||
"boto3",
|
"boto3",
|
||||||
# filesystem locks e.g. to prevent parallel downloads
|
# filesystem locks e.g. to prevent parallel downloads
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ if is_torch_available():
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
|
VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
|
||||||
|
VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"}
|
||||||
|
|
||||||
PRETRAINED_VOCAB_FILES_MAP = {
|
PRETRAINED_VOCAB_FILES_MAP = {
|
||||||
"pretrained_vocab_file": {
|
"pretrained_vocab_file": {
|
||||||
@@ -119,13 +120,23 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
|||||||
self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols))
|
self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols))
|
||||||
self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
|
self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
|
||||||
|
|
||||||
if pretrained_vocab_file is not None:
|
try:
|
||||||
# Hack because, honestly this tokenizer was not made to be used
|
if pretrained_vocab_file is not None:
|
||||||
# in a library like ours, at all.
|
# Hack because, honestly this tokenizer was not made to be used
|
||||||
vocab_dict = torch.load(pretrained_vocab_file)
|
# in a library like ours, at all.
|
||||||
for key, value in vocab_dict.items():
|
vocab_dict = torch.load(pretrained_vocab_file)
|
||||||
if key not in self.__dict__:
|
for key, value in vocab_dict.items():
|
||||||
self.__dict__[key] = value
|
if key not in self.__dict__:
|
||||||
|
self.__dict__[key] = value
|
||||||
|
|
||||||
|
if vocab_file is not None:
|
||||||
|
self.build_vocab()
|
||||||
|
except Exception:
|
||||||
|
raise ValueError(
|
||||||
|
"Unable to parse file {}. Unknown format. "
|
||||||
|
"If you tried to load a model saved through TransfoXLTokenizerFast,"
|
||||||
|
"please note they are not compatible.".format(pretrained_vocab_file)
|
||||||
|
)
|
||||||
|
|
||||||
if vocab_file is not None:
|
if vocab_file is not None:
|
||||||
self.build_vocab()
|
self.build_vocab()
|
||||||
@@ -179,6 +190,12 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
"""Save the tokenizer vocabulary to a directory or file."""
|
"""Save the tokenizer vocabulary to a directory or file."""
|
||||||
|
|
||||||
|
logger.warning(
|
||||||
|
"Please note you will not be able to load the save vocabulary in"
|
||||||
|
" Rust-based TransfoXLTokenizerFast as they don't share the same structure."
|
||||||
|
)
|
||||||
|
|
||||||
if os.path.isdir(vocab_path):
|
if os.path.isdir(vocab_path):
|
||||||
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
|
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
|
||||||
else:
|
else:
|
||||||
@@ -331,8 +348,15 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
|
|||||||
normalization: Optional[str] = None,
|
normalization: Optional[str] = None,
|
||||||
):
|
):
|
||||||
|
|
||||||
tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
|
try:
|
||||||
tokenizer = Tokenizer(tokenizer)
|
tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
|
||||||
|
tokenizer = Tokenizer(tokenizer)
|
||||||
|
except Exception:
|
||||||
|
raise ValueError(
|
||||||
|
"Unable to parse file {}. Unknown format. "
|
||||||
|
"If you tried to load a model saved through TransfoXLTokenizer,"
|
||||||
|
"please note they are not compatible.".format(vocab_file)
|
||||||
|
)
|
||||||
|
|
||||||
# Create the correct normalization path
|
# Create the correct normalization path
|
||||||
normalizer = []
|
normalizer = []
|
||||||
@@ -379,7 +403,7 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
|
|||||||
|
|
||||||
class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
|
class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES_FAST
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
|
||||||
@@ -419,6 +443,14 @@ class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def save_pretrained(self, save_directory):
|
||||||
|
logger.warning(
|
||||||
|
"Please note you will not be able to load the vocabulary in"
|
||||||
|
" Python-based TransfoXLTokenizer as they don't share the same structure."
|
||||||
|
)
|
||||||
|
|
||||||
|
return super().save_pretrained(save_directory)
|
||||||
|
|
||||||
|
|
||||||
class LMOrderedIterator(object):
|
class LMOrderedIterator(object):
|
||||||
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
|
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
|
||||||
|
|||||||
@@ -1906,8 +1906,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
if os.path.isdir(save_directory):
|
if os.path.isdir(save_directory):
|
||||||
folder, file = save_directory, self.vocab_files_names["vocab_file"]
|
files = self._tokenizer.save(save_directory)
|
||||||
else:
|
else:
|
||||||
folder, file = os.path.split(os.path.abspath(save_directory))
|
folder, file = os.path.split(os.path.abspath(save_directory))
|
||||||
|
files = self._tokenizer.save(folder, name=file)
|
||||||
|
|
||||||
return tuple(self._tokenizer.save(folder, file))
|
return tuple(files)
|
||||||
|
|||||||
@@ -258,6 +258,20 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
|
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
|
||||||
self.assertEqual(output_p, output_r)
|
self.assertEqual(output_p, output_r)
|
||||||
|
|
||||||
|
def assert_save_pretrained(self, tokenizer_r, tokenizer_p):
|
||||||
|
|
||||||
|
# Checks it save with the same files
|
||||||
|
self.assertSequenceEqual(tokenizer_r.save_vocabulary("."), tokenizer_p.save_vocabulary("."))
|
||||||
|
|
||||||
|
# Checks everything loads correctly in the same way
|
||||||
|
tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained("."), tokenizer_p.from_pretrained(".")
|
||||||
|
|
||||||
|
# Check special tokens are set accordingly on Rust and Python
|
||||||
|
for key in tokenizer_pp.special_tokens_map:
|
||||||
|
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||||
|
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
|
||||||
|
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
|
||||||
|
|
||||||
def test_bert(self):
|
def test_bert(self):
|
||||||
for tokenizer_name in BertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
|
for tokenizer_name in BertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
|
||||||
tokenizer_p = BertTokenizer.from_pretrained(tokenizer_name)
|
tokenizer_p = BertTokenizer.from_pretrained(tokenizer_name)
|
||||||
@@ -294,7 +308,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check the number of returned files for save_vocabulary
|
# Check the number of returned files for save_vocabulary
|
||||||
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
self.assert_save_pretrained(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check for padding
|
# Check for padding
|
||||||
self.assert_padding(tokenizer_r, tokenizer_p)
|
self.assert_padding(tokenizer_r, tokenizer_p)
|
||||||
@@ -335,12 +349,26 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
# Check alignment for build_inputs_with_special_tokens
|
# Check alignment for build_inputs_with_special_tokens
|
||||||
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check the number of returned files for save_vocabulary
|
|
||||||
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
|
||||||
|
|
||||||
# Check for padding
|
# Check for padding
|
||||||
self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
|
self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
|
# Check the number of returned files for save_vocabulary
|
||||||
|
# TransfoXL tokenizers comes in a special format which is not compatible at all
|
||||||
|
# with rust tokenizers. We ensure the errors detection at correctly raised
|
||||||
|
tokenizer_r_files = tokenizer_r.save_pretrained(".")
|
||||||
|
self.assertSequenceEqual(
|
||||||
|
tokenizer_r_files, ["./vocab.json", "./special_tokens_map.json", "./added_tokens.json"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check loading Python-tokenizer save through Rust doesnt work (and the opposite)
|
||||||
|
self.assertRaises(ValueError, tokenizer_p.from_pretrained, *tokenizer_r_files)
|
||||||
|
self.assertRaises(ValueError, tokenizer_r.from_pretrained, *tokenizer_p.save_pretrained("."))
|
||||||
|
|
||||||
|
# Check loading works for Python to Python and Rust to Rust
|
||||||
|
# Issue: https://github.com/huggingface/transformers/issues/3000
|
||||||
|
# self.assertIsNotNone(tokenizer_p.__class__.from_pretrained('./'))
|
||||||
|
self.assertIsNotNone(tokenizer_r.__class__.from_pretrained("./"))
|
||||||
|
|
||||||
def test_distilbert(self):
|
def test_distilbert(self):
|
||||||
for tokenizer_name in DistilBertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
|
for tokenizer_name in DistilBertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
|
||||||
tokenizer_p = DistilBertTokenizer.from_pretrained(tokenizer_name)
|
tokenizer_p = DistilBertTokenizer.from_pretrained(tokenizer_name)
|
||||||
@@ -378,7 +406,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check the number of returned files for save_vocabulary
|
# Check the number of returned files for save_vocabulary
|
||||||
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
self.assert_save_pretrained(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check for padding
|
# Check for padding
|
||||||
self.assert_padding(tokenizer_r, tokenizer_p)
|
self.assert_padding(tokenizer_r, tokenizer_p)
|
||||||
@@ -419,7 +447,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check the number of returned files for save_vocabulary
|
# Check the number of returned files for save_vocabulary
|
||||||
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
self.assert_save_pretrained(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check for padding
|
# Check for padding
|
||||||
self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
|
self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
|
||||||
@@ -460,7 +488,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check the number of returned files for save_vocabulary
|
# Check the number of returned files for save_vocabulary
|
||||||
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
self.assert_save_pretrained(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check for padding
|
# Check for padding
|
||||||
# TODO: Re-enable this test as soon as Roberta align with the python tokenizer.
|
# TODO: Re-enable this test as soon as Roberta align with the python tokenizer.
|
||||||
@@ -501,12 +529,10 @@ class FastTokenizerMatchingTest(unittest.TestCase):
|
|||||||
# Check alignment for build_inputs_with_special_tokens
|
# Check alignment for build_inputs_with_special_tokens
|
||||||
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
# Check the number of returned files for save_vocabulary
|
|
||||||
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
|
||||||
|
|
||||||
# Check for padding
|
# Check for padding
|
||||||
self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
|
self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
|
||||||
|
|
||||||
|
# Check the number of returned files for save_vocabulary
|
||||||
if __name__ == "__main__":
|
self.assert_save_pretrained(tokenizer_r, tokenizer_p)
|
||||||
unittest.main()
|
|
||||||
|
|||||||
Reference in New Issue
Block a user