From 47975ed53ec96edfcd83c101c5aac7943f2dd30e Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 6 Aug 2019 11:21:48 -0400
Subject: [PATCH 01/15] Language Modeling fine-tuning using GPT-2.

---
 examples/run_generative_finetuning.py | 402 ++++++++++++++++++++++++++
 examples/utils_lm.py                  |  42 +++
 2 files changed, 444 insertions(+)
 create mode 100644 examples/run_generative_finetuning.py
 create mode 100644 examples/utils_lm.py

diff --git a/examples/run_generative_finetuning.py b/examples/run_generative_finetuning.py
new file mode 100644
index 0000000000..e9e4545dfe
--- /dev/null
+++ b/examples/run_generative_finetuning.py
@@ -0,0 +1,402 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for language modeling on WikiText-2 (GPT, GPT-2, XLM)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, SequentialSampler,)
+from torch.utils.data.distributed import DistributedSampler
+from tensorboardX import SummaryWriter
+from tqdm import tqdm, trange
+
+from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,)
+from pytorch_transformers import AdamW, WarmupLinearSchedule
+
+from utils_lm import WikiTextDataset
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config,)), ())
+
+MODEL_CLASSES = {
+    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=WikiTextDataset.collate)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            batch.to(args.device)
+            model.train()
+            outputs = model(batch, labels=batch)
+            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                scheduler.step()  # Update learning rate schedule
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_output_dir = args.output_dir
+
+    results = {}
+    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
+
+    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(eval_output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=WikiTextDataset.collate)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    eval_loss = 0.0
+    nb_eval_steps = 0
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch.to(args.device)
+
+        with torch.no_grad():
+            outputs = model(batch, labels=batch)
+            lm_loss = outputs[0]
+            eval_loss += lm_loss.mean().item()
+        nb_eval_steps += 1
+
+    eval_loss = eval_loss / nb_eval_steps
+    perplexity = torch.exp(torch.tensor(eval_loss))
+
+    result = {
+        "perplexity": perplexity
+    }
+
+    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results {} *****".format(prefix))
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    dataset = WikiTextDataset(tokenizer, file="test" if evaluate else "train", directory=args.data_dir)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--model_name_or_path", default="gpt2", type=str,
+                        help="The model to be fine-tuned.")
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/utils_lm.py b/examples/utils_lm.py
new file mode 100644
index 0000000000..2b6c393a91
--- /dev/null
+++ b/examples/utils_lm.py
@@ -0,0 +1,42 @@
+from torch.utils.data import Dataset, DataLoader
+import os
+import random
+import torch
+import torch.nn.functional as F
+
+
+class WikiTextDataset(Dataset):
+	def __init__(self, tokenizer, file='train', directory='wikitext', max_context_length=512, device='cpu'):
+		self.device = device
+		self.max_context_length = max_context_length
+
+		self.examples = []
+
+		with open(os.path.join(directory, f"wiki.{file}.raw"), encoding="utf-8") as f:
+			text = f.read()
+			spans = list(filter(lambda item: len(item) > 120, text.split("\n")[:20]))
+
+			for span in spans:
+				span = tokenizer.encode(span)
+				while len(span) > 0:
+					self.examples.append(span[:max_context_length])
+					span = span[max_context_length:]
+
+		# Randomly shuffle the examples array
+		random.shuffle(self.examples)
+
+		# Sort the array by example length.
+		self.examples.sort(key=len)
+
+		print("nice")
+
+	def __len__(self):
+		return len(self.examples)
+
+	def __getitem__(self, item):
+		return torch.tensor(self.examples[item], device=self.device)
+
+	@staticmethod
+	def collate(values):
+		stack = torch.stack([F.pad(value, (len(values[-1]) - value.size(0), 0), "constant", 0) for value in values])
+		return stack

From 3e3e1454974de0e1b72c0688a0341014922cd149 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 6 Aug 2019 12:14:18 -0400
Subject: [PATCH 02/15] Added GPT to the generative fine-tuning.

---
 examples/run_generative_finetuning.py | 6 ++++--
 examples/utils_lm.py                  | 2 --
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/run_generative_finetuning.py b/examples/run_generative_finetuning.py
index e9e4545dfe..458c123553 100644
--- a/examples/run_generative_finetuning.py
+++ b/examples/run_generative_finetuning.py
@@ -30,7 +30,8 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,)
+from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
+                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer)
 from pytorch_transformers import AdamW, WarmupLinearSchedule
 
 from utils_lm import WikiTextDataset
@@ -40,7 +41,8 @@ logger = logging.getLogger(__name__)
 ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config,)), ())
 
 MODEL_CLASSES = {
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
+    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer)
 }
 
 
diff --git a/examples/utils_lm.py b/examples/utils_lm.py
index 2b6c393a91..4a3bafb789 100644
--- a/examples/utils_lm.py
+++ b/examples/utils_lm.py
@@ -28,8 +28,6 @@ class WikiTextDataset(Dataset):
 		# Sort the array by example length.
 		self.examples.sort(key=len)
 
-		print("nice")
-
 	def __len__(self):
 		return len(self.examples)
 

From 5c18825a1850ad59021ea9a914e638256dd372f6 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 6 Aug 2019 14:57:07 -0400
Subject: [PATCH 03/15] Removed dataset limit

---
 examples/utils_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/utils_lm.py b/examples/utils_lm.py
index 4a3bafb789..2944cdc9ea 100644
--- a/examples/utils_lm.py
+++ b/examples/utils_lm.py
@@ -14,7 +14,7 @@ class WikiTextDataset(Dataset):
 
 		with open(os.path.join(directory, f"wiki.{file}.raw"), encoding="utf-8") as f:
 			text = f.read()
-			spans = list(filter(lambda item: len(item) > 120, text.split("\n")[:20]))
+			spans = list(filter(lambda item: len(item) > 120, text.split("\n")))
 
 			for span in spans:
 				span = tokenizer.encode(span)

From 339e556feb1e6b65cee05d8a1e70d487c416e195 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 9 Aug 2019 18:08:15 -0400
Subject: [PATCH 04/15] MLM for BERT, beginning of MLM for RoBERTa; still needs
 a better masking token mechanism.

---
 examples/run_generative_finetuning.py | 62 +++++++++++++++++++++------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/examples/run_generative_finetuning.py b/examples/run_generative_finetuning.py
index 458c123553..44daa3d266 100644
--- a/examples/run_generative_finetuning.py
+++ b/examples/run_generative_finetuning.py
@@ -13,7 +13,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for language modeling on WikiText-2 (GPT, GPT-2, XLM)."""
+"""
+Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
 
 from __future__ import absolute_import, division, print_function
 
@@ -30,8 +34,10 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer)
+from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  BertConfig, BertForMaskedLM, BertTokenizer, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from pytorch_transformers import AdamW, WarmupLinearSchedule
 
 from utils_lm import WikiTextDataset
@@ -42,7 +48,9 @@ ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (
 
 MODEL_CLASSES = {
     'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer)
+    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
 }
 
 
@@ -53,6 +61,18 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+# Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original
+def mask_tokens(inputs, tokenizer, args):
+    labels = inputs.clone()
+    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).byte()
+    labels[~masked_indices] = -1  # We only compute loss on masked tokens
+    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
+    inputs[indices_replaced] = tokenizer.vocab["[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
+    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
+    random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long, device=args.device)
+    inputs[indices_random] = random_words[
+        indices_random]  # 10% of the time, replace masked input tokens with random word
+    return inputs, labels
 
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
@@ -108,13 +128,14 @@ def train(args, train_dataset, model, tokenizer):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             batch.to(args.device)
             model.train()
-            outputs = model(batch, labels=batch)
+            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
+            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
             if args.n_gpu > 1:
@@ -132,8 +153,8 @@ def train(args, train_dataset, model, tokenizer):
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
-                scheduler.step()  # Update learning rate schedule
                 optimizer.step()
+                scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
                 global_step += 1
 
@@ -196,7 +217,7 @@ def evaluate(args, model, tokenizer, prefix=""):
         batch.to(args.device)
 
         with torch.no_grad():
-            outputs = model(batch, labels=batch)
+            outputs = model(batch)
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
         nb_eval_steps += 1
@@ -236,8 +257,16 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
-    parser.add_argument("--model_name_or_path", default="gpt2", type=str,
-                        help="The model to be fine-tuned.")
+    parser.add_argument("--model_name", default="bert", type=str,
+                        help="The model architecture to be fine-tuned.")
+    parser.add_argument("--model_checkpoint", default="bert-base-cased", type=str,
+                        help="The model checkpoint for weights initialization.")
+
+    parser.add_argument("--mlm", action='store_true',
+                        help="Train with masked-language modeling loss instead of language modeling.")
+    parser.add_argument("--mlm_probability", type=float, default=0.15,
+                        help="Ratio of tokens to mask for masked language modeling loss")
+
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
@@ -303,6 +332,10 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
     args = parser.parse_args()
 
+    if args.model_name in ["bert", "roberta"] and not args.mlm:
+        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
+                         "flag (masked language modeling).")
+
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
 
@@ -339,10 +372,11 @@ def main():
     if args.local_rank not in [-1, 0]:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_name]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_checkpoint)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_checkpoint, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_checkpoint, from_tf=bool('.ckpt' in args.model_checkpoint), config=config)
+    args.num_embeddings = config.vocab_size  # We need this to create the model at next line (number of embeddings to use)
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

From 715534800a2a809dbfc66bd17acb36ed30999b0d Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 14 Aug 2019 09:52:57 -0400
Subject: [PATCH 05/15] BERT + RoBERTa masking tokens handling + GPU device
 update.

---
 examples/run_generative_finetuning.py | 27 ++++++++++++++++-----------
 examples/utils_lm.py                  |  5 ++---
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/examples/run_generative_finetuning.py b/examples/run_generative_finetuning.py
index 44daa3d266..ecbf44d8de 100644
--- a/examples/run_generative_finetuning.py
+++ b/examples/run_generative_finetuning.py
@@ -65,11 +65,15 @@ def set_seed(args):
 def mask_tokens(inputs, tokenizer, args):
     labels = inputs.clone()
     masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).byte()
-    labels[~masked_indices] = -1  # We only compute loss on masked tokens
+    labels[~masked_indices.bool()] = -1  # We only compute loss on masked tokens
     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
-    inputs[indices_replaced] = tokenizer.vocab["[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
-    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
-    random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long, device=args.device)
+
+    if args.model_name == "bert":
+        inputs[indices_replaced.bool()] = tokenizer.vocab["[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
+    elif args.model_name == "roberta":
+        inputs[indices_replaced.bool()] = tokenizer.encoder["<mask>"]  # 80% of the time, replace masked input tokens with <mask>
+    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced).bool()
+    random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long)
     inputs[indices_random] = random_words[
         indices_random]  # 10% of the time, replace masked input tokens with random word
     return inputs, labels
@@ -132,14 +136,15 @@ def train(args, train_dataset, model, tokenizer):
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
-            batch.to(args.device)
-            model.train()
             inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
+            inputs = inputs.to(args.device)
+            labels = labels.to(args.device)
+            model.train()
             outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -214,7 +219,7 @@ def evaluate(args, model, tokenizer, prefix=""):
     nb_eval_steps = 0
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
         model.eval()
-        batch.to(args.device)
+        batch = batch.to(args.device)
 
         with torch.no_grad():
             outputs = model(batch)
@@ -285,9 +290,9 @@ def main():
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
 
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                         help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                         help="Batch size per GPU/CPU for evaluation.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
@@ -299,7 +304,7 @@ def main():
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,
                         help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--max_steps", default=-1, type=int,
                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
diff --git a/examples/utils_lm.py b/examples/utils_lm.py
index 2944cdc9ea..68a1ca2cce 100644
--- a/examples/utils_lm.py
+++ b/examples/utils_lm.py
@@ -6,8 +6,7 @@ import torch.nn.functional as F
 
 
 class WikiTextDataset(Dataset):
-	def __init__(self, tokenizer, file='train', directory='wikitext', max_context_length=512, device='cpu'):
-		self.device = device
+	def __init__(self, tokenizer, file='train', directory='wikitext', max_context_length=512):
 		self.max_context_length = max_context_length
 
 		self.examples = []
@@ -32,7 +31,7 @@ class WikiTextDataset(Dataset):
 		return len(self.examples)
 
 	def __getitem__(self, item):
-		return torch.tensor(self.examples[item], device=self.device)
+		return torch.tensor(self.examples[item])
 
 	@staticmethod
 	def collate(values):

From 5652f54ac26f3233f4dcbfd9a2f6879e94a0bc59 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 16 Aug 2019 13:49:56 -0400
Subject: [PATCH 06/15] Simplified data generator + better perplexity
 calculator

GPT-2 now obtains ~20 perplexity on WikiText-2
---
 examples/run_generative_finetuning.py |  9 +++++----
 examples/utils_lm.py                  | 23 +++++------------------
 2 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/examples/run_generative_finetuning.py b/examples/run_generative_finetuning.py
index ecbf44d8de..bb6aee6f07 100644
--- a/examples/run_generative_finetuning.py
+++ b/examples/run_generative_finetuning.py
@@ -85,7 +85,7 @@ def train(args, train_dataset, model, tokenizer):
 
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=WikiTextDataset.collate)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     if args.max_steps > 0:
         t_total = args.max_steps
@@ -209,7 +209,7 @@ def evaluate(args, model, tokenizer, prefix=""):
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
     eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
-    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=WikiTextDataset.collate)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # Eval!
     logger.info("***** Running evaluation {} *****".format(prefix))
@@ -217,12 +217,13 @@ def evaluate(args, model, tokenizer, prefix=""):
     logger.info("  Batch size = %d", args.eval_batch_size)
     eval_loss = 0.0
     nb_eval_steps = 0
+    model.eval()
+
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        model.eval()
         batch = batch.to(args.device)
 
         with torch.no_grad():
-            outputs = model(batch)
+            outputs = model(batch, masked_lm_labels=batch) if args.mlm else model(batch, labels=batch)
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
         nb_eval_steps += 1
diff --git a/examples/utils_lm.py b/examples/utils_lm.py
index 68a1ca2cce..5f22e10a76 100644
--- a/examples/utils_lm.py
+++ b/examples/utils_lm.py
@@ -6,34 +6,21 @@ import torch.nn.functional as F
 
 
 class WikiTextDataset(Dataset):
-	def __init__(self, tokenizer, file='train', directory='wikitext', max_context_length=512):
+	def __init__(self, tokenizer, file='train', directory='wikitext', max_context_length=1024):
 		self.max_context_length = max_context_length
 
 		self.examples = []
 
 		with open(os.path.join(directory, f"wiki.{file}.raw"), encoding="utf-8") as f:
 			text = f.read()
-			spans = list(filter(lambda item: len(item) > 120, text.split("\n")))
+			tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 
-			for span in spans:
-				span = tokenizer.encode(span)
-				while len(span) > 0:
-					self.examples.append(span[:max_context_length])
-					span = span[max_context_length:]
-
-		# Randomly shuffle the examples array
-		random.shuffle(self.examples)
-
-		# Sort the array by example length.
-		self.examples.sort(key=len)
+			while len(tokenized_text) > max_context_length:
+				self.examples.append(tokenized_text[:max_context_length])
+				tokenized_text = tokenized_text[max_context_length:]
 
 	def __len__(self):
 		return len(self.examples)
 
 	def __getitem__(self, item):
 		return torch.tensor(self.examples[item])
-
-	@staticmethod
-	def collate(values):
-		stack = torch.stack([F.pad(value, (len(values[-1]) - value.size(0), 0), "constant", 0) for value in values])
-		return stack

From f94f1c6016414e059fa4d8ef61ee194fdc891046 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 19 Aug 2019 14:58:50 -0400
Subject: [PATCH 07/15] Distributed training + tokenizer agnostic mask token

---
 examples/run_generative_finetuning.py | 14 +++-----------
 examples/utils_lm.py                  | 27 ++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/examples/run_generative_finetuning.py b/examples/run_generative_finetuning.py
index bb6aee6f07..8501364ae4 100644
--- a/examples/run_generative_finetuning.py
+++ b/examples/run_generative_finetuning.py
@@ -39,12 +39,10 @@ from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT
                                   BertConfig, BertForMaskedLM, BertTokenizer, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                   RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from pytorch_transformers import AdamW, WarmupLinearSchedule
+logger = logging.getLogger(__name__)
 
 from utils_lm import WikiTextDataset
 
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config,)), ())
 
 MODEL_CLASSES = {
     'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
@@ -68,10 +66,7 @@ def mask_tokens(inputs, tokenizer, args):
     labels[~masked_indices.bool()] = -1  # We only compute loss on masked tokens
     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
 
-    if args.model_name == "bert":
-        inputs[indices_replaced.bool()] = tokenizer.vocab["[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
-    elif args.model_name == "roberta":
-        inputs[indices_replaced.bool()] = tokenizer.encoder["<mask>"]  # 80% of the time, replace masked input tokens with <mask>
+    inputs[indices_replaced.bool()] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) # 80% of the time, replace masked input tokens with [MASK]
     indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced).bool()
     random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long)
     inputs[indices_random] = random_words[
@@ -246,10 +241,7 @@ def evaluate(args, model, tokenizer, prefix=""):
 
 
 def load_and_cache_examples(args, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    dataset = WikiTextDataset(tokenizer, file="test" if evaluate else "train", directory=args.data_dir)
+    dataset = WikiTextDataset(args, tokenizer, file="test" if evaluate else "train", directory=args.data_dir)
     return dataset
 
 
diff --git a/examples/utils_lm.py b/examples/utils_lm.py
index 5f22e10a76..251aea90e1 100644
--- a/examples/utils_lm.py
+++ b/examples/utils_lm.py
@@ -3,10 +3,27 @@ import os
 import random
 import torch
 import torch.nn.functional as F
+import logging
+import pickle
+
+logger = logging.getLogger(__name__)
 
 
 class WikiTextDataset(Dataset):
-	def __init__(self, tokenizer, file='train', directory='wikitext', max_context_length=1024):
+	def __init__(self, args, tokenizer, file='train', directory='wikitext', max_context_length=512, cache=None):
+		if args.local_rank not in [-1, 0]:
+			torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+			
+			
+		cached_features_file = os.path.join(args.data_dir, f'cached_lm_{file}_{args.max_seq_length}')
+		
+		if os.path.exists(cached_features_file):
+			logger.info("Loading features from cached file %s", cached_features_file)
+			with open(cached_features_file, 'rb') as handle:
+				self.examples = pickle.load(handle)
+		else:
+			logger.info("Creating features from dataset file at %s", args.data_dir)	
+		
 		self.max_context_length = max_context_length
 
 		self.examples = []
@@ -18,6 +35,14 @@ class WikiTextDataset(Dataset):
 			while len(tokenized_text) > max_context_length:
 				self.examples.append(tokenized_text[:max_context_length])
 				tokenized_text = tokenized_text[max_context_length:]
+			
+		if args.local_rank in [-1, 0]:
+			logger.info("Saving features into cached file %s", cached_features_file)
+			with open(cached_features_file, 'wb') as handle:
+				pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+		
+		if args.local_rank == 0:
+			torch.distributed.barrier()
 
 	def __len__(self):
 		return len(self.examples)

From a690edab174cd1b7a5b684db34158b16c68441f8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 15:52:12 +0200
Subject: [PATCH 08/15] Various fixes and cleanup in run_lm_finetuning

---
 .gitignore                                    |   5 +-
 ...ive_finetuning.py => run_lm_finetuning.py} | 165 ++++++++++++------
 examples/utils_lm.py                          |  51 ------
 3 files changed, 116 insertions(+), 105 deletions(-)
 rename examples/{run_generative_finetuning.py => run_lm_finetuning.py} (75%)
 delete mode 100644 examples/utils_lm.py

diff --git a/.gitignore b/.gitignore
index 6bbe32df6c..bbc738b931 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,4 +127,7 @@ proc_data
 
 # examples
 runs
-examples/runs
\ No newline at end of file
+examples/runs
+
+# data
+data
\ No newline at end of file
diff --git a/examples/run_generative_finetuning.py b/examples/run_lm_finetuning.py
similarity index 75%
rename from examples/run_generative_finetuning.py
rename to examples/run_lm_finetuning.py
index 8501364ae4..bd7047a587 100644
--- a/examples/run_generative_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -25,33 +25,75 @@ import argparse
 import glob
 import logging
 import os
+import pickle
 import random
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, SequentialSampler,)
+from torch.utils.data import DataLoader, Dataset, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                  BertConfig, BertForMaskedLM, BertTokenizer, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
-logger = logging.getLogger(__name__)
+from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+                                  BertConfig, BertForMaskedLM, BertTokenizer,
+                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
+                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
+                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
 
-from utils_lm import WikiTextDataset
+
+logger = logging.getLogger(__name__)
 
 
 MODEL_CLASSES = {
     'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
     'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
-    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
+    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
 }
 
 
+class TextDataset(Dataset):
+    def __init__(self, tokenizer, file_path='train', block_size=512):
+        assert os.path.isfile(file_path)
+        directory, filename = os.path.split(file_path)
+        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
+
+        if os.path.exists(cached_features_file):
+            logger.info("Loading features from cached file %s", cached_features_file)
+            with open(cached_features_file, 'rb') as handle:
+                self.examples = pickle.load(handle)
+        else:
+            logger.info("Creating features from dataset file at %s", directory)
+
+            self.examples = []
+            with open(file_path, encoding="utf-8") as f:
+                text = f.read()
+
+            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
+                self.examples.append(tokenized_text[:block_size])
+                tokenized_text = tokenized_text[block_size:]
+            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
+            # If your dataset is small, first you should loook for a bigger one :-) and second you
+            # can change this behavior by adding (model specific) padding.
+
+            logger.info("Saving features into cached file %s", cached_features_file)
+            with open(cached_features_file, 'wb') as handle:
+                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, item):
+        return torch.tensor(self.examples[item])
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False):
+    dataset = TextDataset(tokenizer, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
+    return dataset
+
+
 def set_seed(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -59,20 +101,27 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
-# Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original
-def mask_tokens(inputs, tokenizer, args):
-    labels = inputs.clone()
-    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).byte()
-    labels[~masked_indices.bool()] = -1  # We only compute loss on masked tokens
-    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
 
-    inputs[indices_replaced.bool()] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) # 80% of the time, replace masked input tokens with [MASK]
-    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced).bool()
-    random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long)
-    inputs[indices_random] = random_words[
-        indices_random]  # 10% of the time, replace masked input tokens with random word
+def mask_tokens(inputs, tokenizer, args):
+    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
+    labels = inputs.clone()
+    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).byte()
+    labels[~masked_indices] = -1  # We only compute loss on masked tokens
+
+    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
+    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
+
+    # 10% of the time, we replace masked input tokens with random word
+    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
+    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
+    inputs[indices_random] = random_words[indices_random]
+
+    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
     return inputs, labels
 
+
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
@@ -146,13 +195,15 @@ def train(args, train_dataset, model, tokenizer):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
@@ -240,24 +291,22 @@ def evaluate(args, model, tokenizer, prefix=""):
     return results
 
 
-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = WikiTextDataset(args, tokenizer, file="test" if evaluate else "train", directory=args.data_dir)
-    return dataset
-
-
 def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--train_data_file", default=None, type=str, required=True,
+                        help="The input training data file (a text file).")
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
-    parser.add_argument("--model_name", default="bert", type=str,
+    parser.add_argument("--eval_data_file", default=None, type=str,
+                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
+
+    parser.add_argument("--model_type", default="bert", type=str,
                         help="The model architecture to be fine-tuned.")
-    parser.add_argument("--model_checkpoint", default="bert-base-cased", type=str,
+    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
                         help="The model checkpoint for weights initialization.")
 
     parser.add_argument("--mlm", action='store_true',
@@ -266,20 +315,21 @@ def main():
                         help="Ratio of tokens to mask for masked language modeling loss")
 
     parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
+                        help="Optional pretrained config name or path if not the same as model_name_or_path")
     parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
+                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
     parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
+                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
+    parser.add_argument("--block_size", default=-1, type=int,
+                        help="Optional input sequence length after tokenization."
+                             "The training dataset will be truncated in block of this size for training."
+                             "Default to the model max input length.")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',
                         help="Whether to run eval on the dev set.")
     parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
+                        help="Run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
 
@@ -309,7 +359,7 @@ def main():
     parser.add_argument('--save_steps', type=int, default=50,
                         help="Save checkpoint every X updates steps.")
     parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
@@ -330,9 +380,12 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
     args = parser.parse_args()
 
-    if args.model_name in ["bert", "roberta"] and not args.mlm:
+    if args.model_type in ["bert", "roberta"] and not args.mlm:
         raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                          "flag (masked language modeling).")
+    if args.eval_data_file is None and args.do_eval:
+        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+                         "or remove the --do_eval argument.")
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
@@ -368,30 +421,36 @@ def main():
 
     # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
 
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_name]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_checkpoint)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_checkpoint, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_checkpoint, from_tf=bool('.ckpt' in args.model_checkpoint), config=config)
-    args.num_embeddings = config.vocab_size  # We need this to create the model at next line (number of embeddings to use)
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    if args.block_size <= 0:
+        args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    model.to(args.device)
 
     if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
+        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab
 
     logger.info("Training/evaluation parameters %s", args)
 
-
     # Training
     if args.do_train:
+        if args.local_rank not in [-1, 0]:
+            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
+
         train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
+
+        if args.local_rank == 0:
+            torch.distributed.barrier()
+
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
@@ -409,7 +468,7 @@ def main():
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
 
diff --git a/examples/utils_lm.py b/examples/utils_lm.py
deleted file mode 100644
index 251aea90e1..0000000000
--- a/examples/utils_lm.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from torch.utils.data import Dataset, DataLoader
-import os
-import random
-import torch
-import torch.nn.functional as F
-import logging
-import pickle
-
-logger = logging.getLogger(__name__)
-
-
-class WikiTextDataset(Dataset):
-	def __init__(self, args, tokenizer, file='train', directory='wikitext', max_context_length=512, cache=None):
-		if args.local_rank not in [-1, 0]:
-			torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-			
-			
-		cached_features_file = os.path.join(args.data_dir, f'cached_lm_{file}_{args.max_seq_length}')
-		
-		if os.path.exists(cached_features_file):
-			logger.info("Loading features from cached file %s", cached_features_file)
-			with open(cached_features_file, 'rb') as handle:
-				self.examples = pickle.load(handle)
-		else:
-			logger.info("Creating features from dataset file at %s", args.data_dir)	
-		
-		self.max_context_length = max_context_length
-
-		self.examples = []
-
-		with open(os.path.join(directory, f"wiki.{file}.raw"), encoding="utf-8") as f:
-			text = f.read()
-			tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-
-			while len(tokenized_text) > max_context_length:
-				self.examples.append(tokenized_text[:max_context_length])
-				tokenized_text = tokenized_text[max_context_length:]
-			
-		if args.local_rank in [-1, 0]:
-			logger.info("Saving features into cached file %s", cached_features_file)
-			with open(cached_features_file, 'wb') as handle:
-				pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
-		
-		if args.local_rank == 0:
-			torch.distributed.barrier()
-
-	def __len__(self):
-		return len(self.examples)
-
-	def __getitem__(self, item):
-		return torch.tensor(self.examples[item])

From 2d042274ac9ee6cd03aabcb861126937a29feb1a Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 20 Aug 2019 14:15:28 -0400
Subject: [PATCH 09/15] Sequence special token handling for BERT and RoBERTa

---
 examples/run_lm_finetuning.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index bd7047a587..c69d4db53b 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -71,9 +71,15 @@ class TextDataset(Dataset):
                 text = f.read()
 
             tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+            tokenized_text = tokenizer.add_special_tokens_single_sentence(tokenized_text)
             while len(tokenized_text) >= block_size:  # Truncate in block of block_size
-                self.examples.append(tokenized_text[:block_size])
-                tokenized_text = tokenized_text[block_size:]
+                if isinstance(tokenizer, (BertTokenizer, RobertaTokenizer)):
+                    self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size - 2]))
+                    tokenized_text = tokenized_text[block_size - 2:]
+                else:
+                    self.examples.append(tokenized_text[:block_size])
+                    tokenized_text = tokenized_text[block_size:]
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.

From d6bbcbc4cf79f0d6da6d4753f4d128ff7e3e42a5 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Wed, 21 Aug 2019 11:22:05 -0400
Subject: [PATCH 10/15] Added finetuning example to documentation

---
 docs/source/examples.rst | 49 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 51c8d850b9..40e22725ce 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -12,8 +12,8 @@ Examples
      - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
    * - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_
      - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
-   * - `Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2 <#fine-tuning>`_
-     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py`` and ``run_gpt2.py``
+   * - `Fine-tuning with OpenAI GPT, Transformer-XL, GPT-2 as well as BERT and RoBERTa <#fine-tuning>`_
+     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py``, ``run_gpt2.py`` and ``run_lm_finetuning.py``
    * - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_
      - How to fine tune ``BERT large``
 
@@ -393,12 +393,13 @@ Thank to the work of @Rocketknight1 and @tholor there are now **several scripts*
 OpenAI GPT, Transformer-XL and GPT-2: running the examples
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We provide three examples of scripts for OpenAI GPT, Transformer-XL and OpenAI GPT-2 based on (and extended from) the respective original implementations:
+We provide four examples of scripts for OpenAI GPT, Transformer-XL, OpenAI GPT-2, BERT and RoBERTa based on (and extended from) the respective original implementations:
 
 
 * fine-tuning OpenAI GPT on the ROCStories dataset
 * evaluating Transformer-XL on Wikitext 103
 * unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
+* fine-tuning GPT/GPT-2 on a causal language modeling task and BERT/RoBERTa on a masked language modeling task
 
 Fine-tuning OpenAI GPT on the RocStories dataset
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -452,7 +453,47 @@ Unconditional generation:
 
    python run_gpt2.py --unconditional
 
-The same option as in the original scripts are provided, please refere to the code of the example and the original repository of OpenAI.
+The same options as in the original scripts are provided; please refer to the code of the example and the original repository of OpenAI.
+
+
+Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Before running the following examples you should download the `WikiText-2 dataset <https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/>`__ and unpack it to some directory ``$WIKITEXT_2_DATASET``.
+The following results were obtained using the ``raw`` WikiText-2 (no tokens were replaced before the tokenization).
+
+This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity).
+
+.. code-block:: bash
+
+    export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
+    python run_lm_finetuning.py \
+        --output_dir=output \
+        --model_type=gpt2 \
+        --model_name_or_path=gpt2 \
+        --do_train \
+        --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw \
+        --do_eval \
+        --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
+
+This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run.
+It reaches a score of about 20 perplexity once fine-tuned on the dataset.
+
+This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is a masked language modeling loss (masked perplexity).
+The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling.
+
+.. code-block:: bash
+
+    export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
+    python run_lm_finetuning.py \
+        --output_dir=output \
+        --model_type=roberta \
+        --model_name_or_path=roberta-base \
+        --do_train \
+        --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw \
+        --do_eval \
+        --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw \
+        --mlm
 
 .. _fine-tuning-BERT-large:
 

From 47d6853439318f1be596219e270bee4e3819dfbb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 23 Aug 2019 17:31:11 +0200
Subject: [PATCH 11/15] adding max_lengths for single sentences and sentences
 pairs

---
 pytorch_transformers/tokenization_bert.py    | 8 ++++++++
 pytorch_transformers/tokenization_roberta.py | 8 ++++++++
 pytorch_transformers/tokenization_utils.py   | 8 ++++++++
 pytorch_transformers/tokenization_xlm.py     | 8 ++++++++
 pytorch_transformers/tokenization_xlnet.py   | 8 ++++++++
 5 files changed, 40 insertions(+)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 04f35aa466..8ea71ba92b 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
                                                   tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     @property
     def vocab_size(self):
         return len(self.vocab)
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index edf4717c89..44047e636f 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 4  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index d2855e0922..a128c3fd72 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 2d2f3a8cd4..b544923e35 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 371b3c9407..a282d67904 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.

From ab7bd5ef98c797132ab5c3378599b3eeec9041d9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 23 Aug 2019 17:31:21 +0200
Subject: [PATCH 12/15] fixing tokenization and training

---
 examples/run_lm_finetuning.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index c69d4db53b..015f742299 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -30,7 +30,7 @@ import random
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, Dataset, SequentialSampler
+from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
@@ -72,14 +72,9 @@ class TextDataset(Dataset):
 
             tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 
-            tokenized_text = tokenizer.add_special_tokens_single_sentence(tokenized_text)
             while len(tokenized_text) >= block_size:  # Truncate in block of block_size
-                if isinstance(tokenizer, (BertTokenizer, RobertaTokenizer)):
-                    self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size - 2]))
-                    tokenized_text = tokenized_text[block_size - 2:]
-                else:
-                    self.examples.append(tokenized_text[:block_size])
-                    tokenized_text = tokenized_text[block_size:]
+                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
+                tokenized_text = tokenized_text[block_size:]
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
@@ -112,15 +107,15 @@ def mask_tokens(inputs, tokenizer, args):
     """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
     labels = inputs.clone()
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).byte()
+    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool()
     labels[~masked_indices] = -1  # We only compute loss on masked tokens
 
     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
-    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
+    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
     inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
 
     # 10% of the time, we replace masked input tokens with random word
-    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
+    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
     random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
     inputs[indices_random] = random_words[indices_random]
 
@@ -134,7 +129,7 @@ def train(args, train_dataset, model, tokenizer):
         tb_writer = SummaryWriter()
 
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     if args.max_steps > 0:
@@ -329,7 +324,7 @@ def main():
     parser.add_argument("--block_size", default=-1, type=int,
                         help="Optional input sequence length after tokenization."
                              "The training dataset will be truncated in block of this size for training."
-                             "Default to the model max input length.")
+                             "Default to the model max input length fo single sentences inputs (take into account special tokens).")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',
@@ -433,7 +428,8 @@ def main():
     config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
     if args.block_size <= 0:
-        args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
+        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
+    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
     model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
     model.to(args.device)
 

From 3bcbebd440c220adbaab657f2d13dac7c89f6453 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 23 Aug 2019 22:07:26 +0200
Subject: [PATCH 13/15] max_len_single_sentence & max_len_sentences_pair as
 attributes so they can be modified

---
 pytorch_transformers/tokenization_bert.py       | 11 +++--------
 pytorch_transformers/tokenization_gpt2.py       |  2 ++
 pytorch_transformers/tokenization_openai.py     |  3 +++
 pytorch_transformers/tokenization_roberta.py    | 11 +++--------
 pytorch_transformers/tokenization_transfo_xl.py |  4 ++++
 pytorch_transformers/tokenization_utils.py      | 11 +++--------
 pytorch_transformers/tokenization_xlm.py        | 12 ++++--------
 pytorch_transformers/tokenization_xlnet.py      | 12 ++++--------
 8 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 8ea71ba92b..92f027038d 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -125,6 +125,9 @@ class BertTokenizer(PreTrainedTokenizer):
         super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
@@ -139,14 +142,6 @@ class BertTokenizer(PreTrainedTokenizer):
                                                   tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
-    @property
-    def max_len_single_sentence(self):
-        return self.max_len - 2  # take into account special tokens
-
-    @property
-    def max_len_sentences_pair(self):
-        return self.max_len - 3  # take into account special tokens
-
     @property
     def vocab_size(self):
         return len(self.vocab)
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index e67f25ff59..13806a3708 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -108,6 +108,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                  bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
         super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
+        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 51b418ebd3..0efbdb37c0 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -87,6 +87,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
 
+        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
+
         try:
             import ftfy
             from spacy.lang.en import English
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 44047e636f..e8ab29238e 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -77,6 +77,9 @@ class RobertaTokenizer(PreTrainedTokenizer):
                                                sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                                mask_token=mask_token, **kwargs)
 
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
+
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
@@ -160,14 +163,6 @@ class RobertaTokenizer(PreTrainedTokenizer):
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
 
-    @property
-    def max_len_single_sentence(self):
-        return self.max_len - 2  # take into account special tokens
-
-    @property
-    def max_len_sentences_pair(self):
-        return self.max_len - 4  # take into account special tokens
-
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 992dff80d5..c603ba695c 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
                                                  additional_special_tokens=additional_special_tokens,
                                                  **kwargs)
+
+        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
+
         if never_split is None:
             never_split = self.all_special_tokens
         if special is None:
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index a128c3fd72..2fb7f87e9c 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -67,14 +67,6 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
-    @property
-    def max_len_single_sentence(self):
-        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
-
-    @property
-    def max_len_sentences_pair(self):
-        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
-
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
@@ -174,6 +166,9 @@ class PreTrainedTokenizer(object):
         self._additional_special_tokens = []
 
         self.max_len = max_len if max_len is not None else int(1e12)
+        self.max_len_single_sentence = self.max_len
+        self.max_len_sentences_pair = self.max_len
+
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index b544923e35..2b930458bb 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            cls_token=cls_token, mask_token=mask_token,
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import ftfy
             from spacy.lang.en import English
@@ -215,14 +219,6 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
-    @property
-    def max_len_single_sentence(self):
-        return self.max_len - 2  # take into account special tokens
-
-    @property
-    def max_len_sentences_pair(self):
-        return self.max_len - 3  # take into account special tokens
-
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index a282d67904..ac7231bb68 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                                              pad_token=pad_token, cls_token=cls_token,
                                              mask_token=mask_token, additional_special_tokens=
                                              additional_special_tokens, **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import sentencepiece as spm
         except ImportError:
@@ -177,14 +181,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
-    @property
-    def max_len_single_sentence(self):
-        return self.max_len - 2  # take into account special tokens
-
-    @property
-    def max_len_sentences_pair(self):
-        return self.max_len - 3  # take into account special tokens
-
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.

From 06510ccb5314f629816888a5b6eed953b30d1046 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 23 Aug 2019 22:08:10 +0200
Subject: [PATCH 14/15] typo

---
 examples/run_lm_finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 015f742299..d37f7a443a 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -324,7 +324,7 @@ def main():
     parser.add_argument("--block_size", default=-1, type=int,
                         help="Optional input sequence length after tokenization."
                              "The training dataset will be truncated in block of this size for training."
-                             "Default to the model max input length fo single sentences inputs (take into account special tokens).")
+                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',

From 529a16dec6cc9bfcf8954a1b16546960f2fab6fa Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 26 Aug 2019 15:00:43 -0400
Subject: [PATCH 15/15] Generic encoding implementation.

---
 pytorch_transformers/tokenization_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 2fb7f87e9c..3596711bdb 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -593,10 +593,12 @@ class PreTrainedTokenizer(object):
             return first_sentence_tokens, second_sentence_tokens
 
     def add_special_tokens_single_sentence(self, token_ids):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
+        return token_ids
 
     def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
+        return token_ids_0 + token_ids_1
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "