diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 3dc0f245c9..435e6c132b 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -242,7 +242,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         # initialize all new embeddings (in particular added tokens)
         self._init_weights(new_embeddings)
 
-        # Copy word embeddings from the previous weights
+        # Copy token embeddings from the previous weights
         num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
         new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
 
@@ -558,7 +558,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                         model.__class__.__name__, "\n\t".join(error_msgs)
                     )
                 )
-        model.tie_weights()  # make sure word embedding weights are still tied if needed
+        model.tie_weights()  # make sure token embedding weights are still tied if needed
 
         # Set model in evaluation mode to desactivate DropOut modules by default
         model.eval()
@@ -843,8 +843,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         """ Generate sequences for each example without beam search (num_beams == 1).
             All returned sequence are generated independantly.
         """
-        # current position / max lengths / length of generated sentences / unfinished sentences
-
+        # length of generated sentences / unfinished sentences
         unfinished_sents = input_ids.new(batch_size).fill_(1)
         sent_lengths = input_ids.new(batch_size).fill_(max_length)
 
@@ -934,7 +933,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         """
 
         # Expand input to num beams
-        # assert input_ids.shape == (batch_size * num_beams, cur_len)
         input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
         input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
 
@@ -946,7 +944,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         # scores for each sentence in the beam
         beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
 
-        # Greedy decoding it is made sure that only words of the first beam are considered to avoid sampling the exact same words three times
+        # For greedy decoding, make sure that only tokens of the first beam are considered at the start, to avoid selecting the exact same tokens num_beams times
         if do_sample is False:
             beam_scores[:, 1:] = -1e9
         beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
@@ -960,7 +958,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         while cur_len < max_length:
             model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
             outputs = self(**model_inputs)  # (batch_size * num_beams, cur_len, vocab_size)
-            scores = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
+            next_token_logits = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
 
             # if model has past, then set the past variable to speed up decoding
             if self._do_output_past(outputs):
@@ -968,14 +966,16 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
             # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
             if repetition_penalty != 1.0:
-                self.enforce_repetition_penalty_(scores, batch_size, num_beams, input_ids, repetition_penalty)
+                self.enforce_repetition_penalty_(
+                    next_token_logits, batch_size, num_beams, input_ids, repetition_penalty
+                )
 
             if do_sample:
                 # Temperature (higher temperature => more likely to sample low probability tokens)
                 if temperature != 1.0:
-                    scores = scores / temperature
+                    next_token_logits = next_token_logits / temperature
 
-                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+                scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * num_beams, vocab_size)
                 _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
 
                 # Top-p/top-k filtering
@@ -988,25 +988,27 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                     batch_size, num_beams * vocab_size
                 )  # (batch_size, num_beams * vocab_size)
 
-                # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
-                next_words = torch.multinomial(
+                # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
+                next_tokens = torch.multinomial(
                     F.softmax(_scores, dim=-1), num_samples=2 * num_beams
                 )  # (batch_size, num_beams * 2)
 
                 # Compute next scores
-                next_scores = torch.gather(_scores, -1, next_words)  # (batch_size, num_beams * 2)
+                next_scores = torch.gather(_scores, -1, next_tokens)  # (batch_size, num_beams * 2)
 
             else:
                 # do greedy beam search
-                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+                scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * num_beams, vocab_size)
                 assert scores.size() == (batch_size * num_beams, vocab_size)
                 # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
-                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
+                next_scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
                 # re-organize to group the beam together (we are keeping top hypothesis accross beams)
-                _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
-                next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
+                next_scores = next_scores.view(
+                    batch_size, num_beams * vocab_size
+                )  # (batch_size, num_beams * vocab_size)
+                next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
 
-            assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)
+            assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams)
 
             # next batch beam content
             # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
@@ -1032,21 +1034,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 # next sentence beam content
                 next_sent_beam = []
 
-                # next words for this sentence
-                for idx, score in zip(next_words[batch_idx], next_scores[batch_idx]):
+                # next tokens for this sentence
+                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
 
                     # get beam and word IDs
                     beam_id = idx // vocab_size
-                    word_id = idx % vocab_size
+                    token_id = idx % vocab_size
 
                     # add to generated hypotheses if end of sentence or last iteration
-                    if eos_token_ids is not None and word_id.item() in eos_token_ids:
+                    if eos_token_ids is not None and token_id.item() in eos_token_ids:
                         generated_hyps[batch_idx].add(
                             input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item(),
                         )
                     else:
                         # add next predicted word if it is not eos_token
-                        next_sent_beam.append((score, word_id, batch_idx * num_beams + beam_id))
+                        next_sent_beam.append((score, token_id, batch_idx * num_beams + beam_id))
 
                     # the beam for next step is full
                     if len(next_sent_beam) == num_beams:
@@ -1060,12 +1062,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
             # sanity check / prepare next batch
             assert len(next_batch_beam) == batch_size * num_beams
             beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
-            beam_words = input_ids.new([x[1] for x in next_batch_beam])
+            beam_tokens = input_ids.new([x[1] for x in next_batch_beam])
             beam_idx = input_ids.new([x[2] for x in next_batch_beam])
 
             # re-order batch
             input_ids = input_ids[beam_idx, :]
-            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)
+            input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1)
 
             # re-order internal states
             if past:
@@ -1081,11 +1083,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
         for batch_idx in range(batch_size):
             # Add all open beam hypothesis to generated_hyps
             if not done[batch_idx]:
-                for idx, score in zip(next_words[batch_idx], next_scores[batch_idx]):
+                for idx, score in zip(next_tokens[batch_idx], next_scores[batch_idx]):
 
                     # get beam and word IDs
                     beam_id = idx // vocab_size
-                    word_id = idx % vocab_size
+                    token_id = idx % vocab_size
                     generated_hyps[batch_idx].add(
                         input_ids[batch_idx * num_beams + beam_id, :cur_len].clone(), score.item()
                     )