From 31e5b5ff2276c61af7eebb4c353934f8f675d728 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 15:22:02 -0500 Subject: [PATCH] Fix tests + first example of doc --- transformers/tokenization_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 2b2cec0c15..63d2cc5cb4 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -255,6 +255,7 @@ class PreTrainedTokenizer(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. @@ -282,6 +283,9 @@ class PreTrainedTokenizer(object): # Download vocabulary from S3 and cache. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + # If vocabulary files are in a directory (e.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') @@ -327,6 +331,9 @@ class PreTrainedTokenizer(object): if os.path.isdir(pretrained_model_name_or_path): # If a directory is provided we look for the standard filenames full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) full_file_name = pretrained_model_name_or_path