Kill model archive maps (#4636)

* Kill model archive maps * Fixup * Also kill model_archive_map for MaskedBertPreTrainedModel * Unhook config_archive_map * Tokenizers: align with model id changes * make style && make quality * Fix CI
2020-06-02 09:39:33 -04:00
parent 47a551d17b
commit d4c2cb402d
115 changed files with 792 additions and 1323 deletions
--- a/examples/contrib/mm-imdb/run_mmimdb.py
+++ b/examples/contrib/mm-imdb/run_mmimdb.py
@@ -34,26 +34,11 @@ from tqdm import tqdm, trange
 from transformers import (
    WEIGHTS_NAME,
    AdamW,
-    AlbertConfig,
-    AlbertModel,
-    AlbertTokenizer,
-    BertConfig,
-    BertModel,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertModel,
-    DistilBertTokenizer,
+    AutoConfig,
+    AutoModel,
+    AutoTokenizer,
    MMBTConfig,
    MMBTForClassification,
-    RobertaConfig,
-    RobertaModel,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMModel,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetModel,
-    XLNetTokenizer,
    get_linear_schedule_with_warmup,
 )
 from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
@@ -67,23 +52,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertModel, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
-}
-

 def set_seed(args):
    random.seed(args.seed)
@@ -351,19 +319,12 @@ def main():
        required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -385,7 +346,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -526,18 +487,14 @@ def main():
    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path
-    )
-    tokenizer = tokenizer_class.from_pretrained(
+    transformer_config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    transformer = model_class.from_pretrained(
-        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    transformer = AutoModel.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir
    )
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
@@ -583,13 +540,12 @@ def main():
        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -31,14 +31,8 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Tenso
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMultipleChoice,
-    BertTokenizer,
-    get_linear_schedule_with_warmup,
-)
+from transformers import WEIGHTS_NAME, AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
+from transformers.modeling_auto import AutoModelForMultipleChoice


 try:
@@ -49,12 +43,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
-}
-

 class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
@@ -492,19 +480,12 @@ def main():
        required=True,
        help="SWAG csv for predictions. E.g., val.csv or test.csv",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -536,9 +517,6 @@ def main():
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
@@ -652,13 +630,9 @@ def main():
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
-    )
-    model = model_class.from_pretrained(
+    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

@@ -694,8 +668,8 @@ def main():
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
@@ -718,8 +692,8 @@ def main():
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            tokenizer = tokenizer_class.from_pretrained(checkpoint)
+            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate