update readme and pretrained model weight files
This commit is contained in:
@@ -129,7 +129,19 @@ def create_and_check_required_methods_tokenizer(tester, input_text, output_text,
|
||||
tester.assertNotEqual(len(tokens_2), 0)
|
||||
tester.assertIsInstance(text_2, (str, unicode))
|
||||
|
||||
|
||||
def create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
    """Check that every pretrained vocab-file map lists the same model names
    as ``tokenizer_class.max_model_input_sizes``.

    Args:
        tester: object exposing ``assertListEqual``.
        input_text: unused by this check.
        output_text: unused by this check.
        tokenizer_class: class exposing ``max_model_input_sizes`` (a mapping
            keyed by model name) and ``pretrained_vocab_files_map`` (a mapping
            whose values are themselves mappings keyed by model name).
        *inputs: unused by this check.
        **kwargs: unused by this check.
    """
    # Compare sorted names: dict iteration order is arbitrary on older
    # interpreters, and two maps declared in a different order still describe
    # the same set of models.
    expected_names = sorted(tokenizer_class.max_model_input_sizes)
    for name_to_file in tokenizer_class.pretrained_vocab_files_map.values():
        tester.assertListEqual(expected_names, sorted(name_to_file))
|
||||
|
||||
|
||||
def create_and_check_tokenizer_commons(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
    """Run the shared tokenizer checks one after another.

    The first two checks receive the sample input/output text; the last two
    only receive the tester and the tokenizer class. ``*inputs`` and
    ``**kwargs`` are forwarded unchanged to every check.
    """
    # Positional argument tuples shared by the checks below.
    with_text = (tester, input_text, output_text, tokenizer_class) + inputs
    without_text = (tester, tokenizer_class) + inputs

    create_and_check_pretrained_model_lists(*with_text, **kwargs)
    create_and_check_required_methods_tokenizer(*with_text, **kwargs)
    create_and_check_add_tokens_tokenizer(*without_text, **kwargs)
    create_and_check_save_and_load_tokenizer(*without_text, **kwargs)
|
||||
|
||||
@@ -138,7 +138,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def save_vocabulary(self, vocab_path):
|
||||
"""Save the tokenizer vocabulary to a directory or file."""
|
||||
index = 0
|
||||
if os.path.isdir(vocab_path):
|
||||
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
|
||||
torch.save(self.__dict__, vocab_file)
|
||||
|
||||
@@ -163,6 +163,11 @@ class PreTrainedTokenizer(object):
|
||||
for file_id, map_list in cls.pretrained_vocab_files_map.items():
|
||||
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
|
||||
else:
|
||||
logger.info(
|
||||
"Model name '{}' not found in model shortcut name list ({}). "
|
||||
"Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
|
||||
pretrained_model_name_or_path, ', '.join(s3_models),
|
||||
pretrained_model_name_or_path))
|
||||
all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
|
||||
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
|
||||
all_vocab_files_names.update(cls.vocab_files_names)
|
||||
@@ -175,6 +180,14 @@ class PreTrainedTokenizer(object):
|
||||
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
|
||||
full_file_name = None
|
||||
vocab_files[file_id] = full_file_name
|
||||
if all(full_file_name is None for full_file_name in vocab_files.values()):
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find tokenizer files "
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path, ', '.join(s3_models),
|
||||
pretrained_model_name_or_path, ))
|
||||
return None
|
||||
|
||||
# Get files from url, cache, or disk depending on the case
|
||||
try:
|
||||
|
||||
@@ -59,6 +59,13 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
|
||||
# Maps each pretrained model shortcut name to a size; every entry listed here
# uses the same value, 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    (
        'xlm-mlm-en-2048',
        'xlm-mlm-ende-1024',
        'xlm-mlm-enfr-1024',
        'xlm-mlm-enro-1024',
        'xlm-mlm-tlm-xnli15-1024',
        'xlm-mlm-xnli15-1024',
        'xlm-clm-enfr-1024',
        'xlm-clm-ende-1024',
    ),
    512,
)
|
||||
|
||||
def get_pairs(word):
|
||||
|
||||
Reference in New Issue
Block a user