diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index a5194d0804..7c662df010 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -506,9 +506,15 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.teacher_type is not None: assert args.teacher_name_or_path is not None @@ -516,8 +522,11 @@ def main(): assert args.alpha_ce + args.alpha_squad > 0. assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT." teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type] - teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path) - teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, config=teacher_config) + teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, + config=teacher_config, + cache_dir=args.cache_dir if args.cache_dir else None) teacher.to(args.device) else: teacher = None @@ -553,8 +562,10 @@ def main(): torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) # Load a trained model and vocabulary that you have fine-tuned - model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + model = model_class.from_pretrained(args.output_dir, cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) @@ -571,7 +582,7 @@ def main(): for checkpoint in checkpoints: # Reload the model global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - model = model_class.from_pretrained(checkpoint) + model = model_class.from_pretrained(checkpoint, cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) # Evaluate diff --git a/examples/run_bertology.py b/examples/run_bertology.py index f37358359d..aae2d150d3 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -304,10 +304,16 @@ def main(): break config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, finetuning_task=args.task_name, - output_attentions=True) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + num_labels=num_labels, + finetuning_task=args.task_name, + output_attentions=True, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab diff --git a/examples/run_glue.py b/examples/run_glue.py index 54f6689e4d..1558a812c3 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -477,9 +477,17 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -514,7 +522,7 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 982d8aa1b7..2044cfe9e8 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -471,12 +471,18 @@ def main(): torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) if args.block_size <= 0: args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) if args.local_rank == 0: diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index e11264cf57..638bbe74f1 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -464,9 +464,17 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab diff --git a/examples/run_ner.py b/examples/run_ner.py index aeb6a9a292..e3887ae453 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -428,11 +428,15 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels) + num_labels=num_labels, + cache_dir=args.cache_dir if args.cache_dir else None) tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), - config=config) + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab diff --git a/examples/run_squad.py b/examples/run_squad.py index a263566c57..254dfbffc9 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -477,9 +477,15 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index e348d9b5ea..489dcb19c7 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -472,9 +472,15 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained(args.model_name_or_path, + from_tf=bool('.ckpt' in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab