diff --git a/setup.py b/setup.py
index 5cba551556..2d3f71e883 100644
--- a/setup.py
+++ b/setup.py
@@ -92,7 +92,7 @@ setup(
     packages=find_packages("src"),
     install_requires=[
         "numpy",
-        "tokenizers == 0.5.0",
+        "tokenizers == 0.5.2",
         # accessing files from S3 directly
         "boto3",
         # filesystem locks e.g. to prevent parallel downloads
diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py
index 6ca9cb46ce..c09cc5acd6 100644
--- a/src/transformers/tokenization_transfo_xl.py
+++ b/src/transformers/tokenization_transfo_xl.py
@@ -45,6 +45,7 @@ if is_torch_available():
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
+VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "pretrained_vocab_file": {
@@ -119,13 +120,20 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols))
         self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
 
-        if pretrained_vocab_file is not None:
-            # Hack because, honestly this tokenizer was not made to be used
-            # in a library like ours, at all.
-            vocab_dict = torch.load(pretrained_vocab_file)
-            for key, value in vocab_dict.items():
-                if key not in self.__dict__:
-                    self.__dict__[key] = value
-
-        if vocab_file is not None:
-            self.build_vocab()
+        try:
+            if pretrained_vocab_file is not None:
+                # Hack because, honestly this tokenizer was not made to be used
+                # in a library like ours, at all.
+                vocab_dict = torch.load(pretrained_vocab_file)
+                for key, value in vocab_dict.items():
+                    if key not in self.__dict__:
+                        self.__dict__[key] = value
+
+            if vocab_file is not None:
+                self.build_vocab()
+        except Exception as e:
+            raise ValueError(
+                "Unable to parse file {}. Unknown format. "
+                "If you tried to load a model saved through TransfoXLTokenizerFast, "
+                "please note they are not compatible.".format(pretrained_vocab_file)
+            ) from e
@@ -179,6 +187,12 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
 
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
+
+        logger.warning(
+            "Please note you will not be able to load the saved vocabulary in"
+            " Rust-based TransfoXLTokenizerFast as they don't share the same structure."
+        )
+
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
         else:
@@ -331,8 +345,15 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
         normalization: Optional[str] = None,
     ):
 
-        tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
-        tokenizer = Tokenizer(tokenizer)
+        try:
+            tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
+            tokenizer = Tokenizer(tokenizer)
+        except Exception as e:
+            raise ValueError(
+                "Unable to parse file {}. Unknown format. "
+                "If you tried to load a model saved through TransfoXLTokenizer, "
+                "please note they are not compatible.".format(vocab_file)
+            ) from e
 
         # Create the correct normalization path
         normalizer = []
@@ -379,7 +400,7 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
 
 class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
 
-    vocab_files_names = VOCAB_FILES_NAMES
+    vocab_files_names = VOCAB_FILES_NAMES_FAST
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
@@ -419,6 +440,15 @@ class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
             **kwargs,
         )
 
+    def save_pretrained(self, save_directory):
+        """Warn that the saved files cannot be loaded by the Python tokenizer, then save as usual."""
+        logger.warning(
+            "Please note you will not be able to load the vocabulary in"
+            " Python-based TransfoXLTokenizer as they don't share the same structure."
+        )
+
+        return super().save_pretrained(save_directory)
+
 
 class LMOrderedIterator(object):
     def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 90215778da..901952798e 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -1906,8 +1906,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
 
     def save_vocabulary(self, save_directory):
         if os.path.isdir(save_directory):
-            folder, file = save_directory, self.vocab_files_names["vocab_file"]
+            files = self._tokenizer.save(save_directory)
         else:
             folder, file = os.path.split(os.path.abspath(save_directory))
+            files = self._tokenizer.save(folder, name=file)
 
-        return tuple(self._tokenizer.save(folder, file))
+        return tuple(files)
diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py
index 31cd850a61..916b86a28f 100644
--- a/tests/test_tokenization_fast.py
+++ b/tests/test_tokenization_fast.py
@@ -258,6 +258,20 @@ class FastTokenizerMatchingTest(unittest.TestCase):
         output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
         self.assertEqual(output_p, output_r)
 
+    def assert_save_pretrained(self, tokenizer_r, tokenizer_p):
+        """Check Rust/Python tokenizers save the same files and reload with special token attributes set."""
+        # Check both tokenizers save the same files
+        self.assertSequenceEqual(tokenizer_r.save_vocabulary("."), tokenizer_p.save_vocabulary("."))
+
+        # Checks everything loads correctly in the same way
+        tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained("."), tokenizer_p.from_pretrained(".")
+
+        # Check special tokens are set accordingly on Rust and Python
+        for key in tokenizer_pp.special_tokens_map:
+            self.assertTrue(hasattr(tokenizer_rp, key))
+            # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
+            # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
+
     def test_bert(self):
         for tokenizer_name in BertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
             tokenizer_p = BertTokenizer.from_pretrained(tokenizer_name)
@@ -294,7 +308,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
             self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
 
             # Check the number of returned files for save_vocabulary
-            self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
+            self.assert_save_pretrained(tokenizer_r, tokenizer_p)
 
             # Check for padding
             self.assert_padding(tokenizer_r, tokenizer_p)
@@ -335,12 +349,26 @@ class FastTokenizerMatchingTest(unittest.TestCase):
             # Check alignment for build_inputs_with_special_tokens
             self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
 
-            # Check the number of returned files for save_vocabulary
-            self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
-
             # Check for padding
             self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
 
+            # Check the number of returned files for save_vocabulary
+            # TransfoXL tokenizers come in a special format which is not compatible at all
+            # with rust tokenizers. We ensure the errors are correctly raised
+            tokenizer_r_files = tokenizer_r.save_pretrained(".")
+            self.assertSequenceEqual(
+                tokenizer_r_files, ["./vocab.json", "./special_tokens_map.json", "./added_tokens.json"]
+            )
+
+            # Check loading a Python-tokenizer save through Rust doesn't work (and the opposite)
+            self.assertRaises(ValueError, tokenizer_p.from_pretrained, *tokenizer_r_files)
+            self.assertRaises(ValueError, tokenizer_r.from_pretrained, *tokenizer_p.save_pretrained("."))
+
+            # Check loading works for Python to Python and Rust to Rust
+            # Issue: https://github.com/huggingface/transformers/issues/3000
+            # self.assertIsNotNone(tokenizer_p.__class__.from_pretrained('./'))
+            self.assertIsNotNone(tokenizer_r.__class__.from_pretrained("./"))
+
     def test_distilbert(self):
         for tokenizer_name in DistilBertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
             tokenizer_p = DistilBertTokenizer.from_pretrained(tokenizer_name)
@@ -378,7 +406,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
             self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
 
             # Check the number of returned files for save_vocabulary
-            self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
+            self.assert_save_pretrained(tokenizer_r, tokenizer_p)
 
             # Check for padding
             self.assert_padding(tokenizer_r, tokenizer_p)
@@ -419,7 +447,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
             self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
 
             # Check the number of returned files for save_vocabulary
-            self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
+            self.assert_save_pretrained(tokenizer_r, tokenizer_p)
 
             # Check for padding
             self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
@@ -460,7 +488,7 @@ class FastTokenizerMatchingTest(unittest.TestCase):
             self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
 
             # Check the number of returned files for save_vocabulary
-            self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
+            self.assert_save_pretrained(tokenizer_r, tokenizer_p)
 
             # Check for padding
             # TODO: Re-enable this test as soon as Roberta align with the python tokenizer.
@@ -501,12 +529,12 @@ class FastTokenizerMatchingTest(unittest.TestCase):
             # Check alignment for build_inputs_with_special_tokens
             self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
 
-            # Check the number of returned files for save_vocabulary
-            self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary(".")))
-
             # Check for padding
             self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p)
+
+            # Check the number of returned files for save_vocabulary
+            self.assert_save_pretrained(tokenizer_r, tokenizer_p)
 
 
 if __name__ == "__main__":
     unittest.main()