diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index f318bf8036..7ad8e4df90 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -30,12 +30,15 @@ Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
 Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
 """
 
+import argparse
 import logging
 import random
 
 import numpy as np
 import torch
 
+from transformers import BertConfig, Bert2Rnd, BertTokenizer
+
 logger = logging.getLogger(__name__)
 
 
@@ -43,25 +46,60 @@ def set_seed(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
+
+
+def load_and_cache_examples(args, tokenizer):
+    raise NotImplementedError
 
 
 def train(args, train_dataset, model, tokenizer):
     """ Fine-tune the pretrained model on the corpus. """
-    # Data sampler
-    # Data loader
-    # Training
-    raise NotImplementedError
-
-
-def evaluate(args, model, tokenizer, prefix=""):
     raise NotImplementedError
 
 
 def main():
-    raise NotImplementedError
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument("--train_data_file",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input training data file (a text file).")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    # Optional parameters
+    parser.add_argument("--model_name_or_path",
+                        default="bert-base-cased",
+                        type=str,
+                        help="The model checkpoint for weights initialization.")
+    parser.add_argument("--seed", default=42, type=int)
+    args = parser.parse_args()
+
+    # Set up training device
+    device = torch.device("cpu")
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    config_class, model_class, tokenizer_class = BertConfig, Bert2Rnd, BertTokenizer
+    config = config_class.from_pretrained(args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path, config=config)
+    model.to(device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    train_dataset = load_and_cache_examples(args, tokenizer)
+    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
-def __main__():
+if __name__ == "__main__":
     main()
diff --git a/examples/run_summarization.py b/examples/run_summarization.py
deleted file mode 100644
index 0a367551d6..0000000000
--- a/examples/run_summarization.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning seq2seq models for abstractive summarization.
-
-The finetuning method for abstractive summarization is inspired by [1]. We
-concatenate the document and summary, mask words of the summary at random and
-maximizing the likelihood of masked words.
-
-[1] Dong Li, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng
-Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
-Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
-"""
-
-import logging
-import random
-
-import numpy as np
-import torch
-
-logger = logging.getLogger(__name__)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer):
-    raise NotImplementedError
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    raise NotImplementedError