diff --git a/.github/stale.yml b/.github/stale.yml
new file mode 100644
index 0000000000..d9f6563218
--- /dev/null
+++ b/.github/stale.yml
@@ -0,0 +1,17 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 60
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+# Label to use when marking an issue as stale
+staleLabel: wontfix
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
\ No newline at end of file
diff --git a/examples/lm_finetuning/README.md b/examples/lm_finetuning/README.md
index c48d9b7069..f04e877ef2 100644
--- a/examples/lm_finetuning/README.md
+++ b/examples/lm_finetuning/README.md
@@ -37,6 +37,7 @@ python3 simple_lm_finetuning.py
 --bert_model bert-base-uncased 
 --do_lower_case 
 --output_dir finetuned_lm/
+--do_train
 ```
 
 ### Pregenerating training data
@@ -60,4 +61,4 @@ python3 finetune_on_pregenerated.py
 --do_lower_case
 --output_dir finetuned_lm/
 --epochs 3
-```
\ No newline at end of file
+```
diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 035f97b0c9..6a63324502 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -123,9 +123,8 @@ def main():
     parser = ArgumentParser()
     parser.add_argument('--pregenerated_data', type=Path, required=True)
     parser.add_argument('--output_dir', type=Path, required=True)
-    parser.add_argument("--bert_model", type=str, required=True,
-                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
-                                 "bert-base-multilingual", "bert-base-chinese"])
+    parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
     parser.add_argument("--do_lower_case", action="store_true")
     parser.add_argument("--reduce_memory", action="store_true",
                         help="Store training data as on-disc memmaps to massively reduce memory usage")
diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index 8cc28d2e78..e6c3598a9f 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -4,7 +4,7 @@ from tqdm import tqdm, trange
 from tempfile import TemporaryDirectory
 import shelve
 
-from random import random, randint, shuffle, choice, sample
+from random import random, randrange, randint, shuffle, choice, sample
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 import numpy as np
 import json
@@ -30,6 +30,8 @@ class DocumentDatabase:
         self.reduce_memory = reduce_memory
 
     def add_document(self, document):
+        if not document:
+            return
         if self.reduce_memory:
             current_idx = len(self.doc_lengths)
             self.document_shelf[str(current_idx)] = document
@@ -49,11 +51,11 @@ class DocumentDatabase:
                 self._precalculate_doc_weights()
             rand_start = self.doc_cumsum[current_idx]
             rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-            sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
+            sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
             sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
         else:
             # If we don't use sentence weighting, then every doc has an equal chance to be chosen
-            sampled_doc_index = current_idx + randint(1, len(self.doc_lengths)-1)
+            sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
         assert sampled_doc_index != current_idx
         if self.reduce_memory:
             return self.document_shelf[str(sampled_doc_index)]
@@ -170,7 +172,7 @@ def create_instances_from_document(
                 # (first) sentence.
                 a_end = 1
                 if len(current_chunk) >= 2:
-                    a_end = randint(1, len(current_chunk) - 1)
+                    a_end = randrange(1, len(current_chunk))
 
                 tokens_a = []
                 for j in range(a_end):
@@ -186,7 +188,7 @@ def create_instances_from_document(
                     # Sample a random document, with longer docs being sampled more frequently
                     random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
 
-                    random_start = randint(0, len(random_document) - 1)
+                    random_start = randrange(0, len(random_document))
                     for j in range(random_start, len(random_document)):
                         tokens_b.extend(random_document[j])
                         if len(tokens_b) >= target_b_length:
@@ -264,6 +266,14 @@ def main():
                 else:
                     tokens = tokenizer.tokenize(line)
                     doc.append(tokens)
+            if doc:
+                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
+        if len(docs) <= 1:
+            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
+                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
+                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
+                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
+                 "sections or paragraphs.")
 
         args.output_dir.mkdir(exist_ok=True)
         for epoch in trange(args.epochs_to_generate, desc="Epoch"):
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index 751d581ad9..4268c41ec6 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -95,7 +95,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with open(input_file, "r", encoding="utf-8") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index b22df39b98..61fbf9f323 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -83,8 +83,9 @@ def run_model():
     elif args.length > model.config.n_ctx:
         raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
 
-    if not args.unconditional:
-        while True:
+    while True:
+        context_tokens = []
+        if not args.unconditional:
             raw_text = input("Model prompt >>> ")
             while not raw_text:
                 print('Prompt should not be empty!')
@@ -123,6 +124,8 @@ def run_model():
                 print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                 print(text)
         print("=" * 80)
+        if args.unconditional:
+            break
 
 if __name__ == '__main__':
     run_model()
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 2736e34d7f..037c6e9723 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -930,7 +930,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 7273e75bf6..b6252d097f 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -605,14 +605,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             return
         # Update config
         self.config.n_special = num_special_tokens
-        # # Build new embeddings and initialize
+        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
         old_embed = self.tokens_embed
         self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        # Initialize all new embeddings (in particular the special tokens)
         self.init_weights(self.tokens_embed)
-        # Copy word and positional embeddings from the previous weights
-        self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
-        self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
+        # Copy word embeddings from the previous weights
+        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
         if position_ids is None: