From f5bcde0b2fcfab961580129ec27ac72f43eaa267 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 30 Sep 2019 16:04:55 -0400
Subject: [PATCH] [multiple-choice] Simplify and use tokenizer.encode_plus

---
 examples/README.md                 |   8 +-
 examples/run_multiple_choice.py    |  16 +--
 examples/utils_multiple_choice.py  | 166 ++++++++++-------------------
 transformers/tokenization_utils.py |  68 ++++++++----
 4 files changed, 116 insertions(+), 142 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index fb5de20a2a..382d794fcb 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,7 +9,7 @@ similar API between the different models.
 | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
-| [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
+| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 
 ## Language model fine-tuning
 
@@ -283,17 +283,17 @@ The results  are the following:
   loss = 0.04755385363816904
 ```
 
-##Multiple Choice
+## Multiple Choice
 
 Based on the script [`run_multiple_choice.py`]().
 
 #### Fine-tuning on SWAG
 Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
 
-```
+```bash
 #training on 4 tesla V100(16GB) GPUS
 export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/single_model_scripts/run_multiple_choice.py \
+python ./examples/run_multiple_choice.py \
 --model_type roberta \
 --task_name swag \
 --model_name_or_path roberta-base \
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 54f3a8a904..2ce1327c98 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -306,14 +306,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
         else:
             examples = processor.get_train_examples(args.data_dir)
         logger.info("Training number: %s", str(len(examples)))
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
-            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args.model_type in ['roberta']),
-            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+        features = convert_examples_to_features(
+            examples,
+            label_list,
+            args.max_seq_length,
+            tokenizer,
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+        )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -362,7 +362,7 @@ def main():
                         help="Whether to run eval on the dev set.")
     parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
     parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
+                        help="Run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
 
diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py
index 7abcc5e1e9..50bb491243 100644
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BERT multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
+""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
 
 from __future__ import absolute_import, division, print_function
 
@@ -26,6 +26,8 @@ import json
 import csv
 import glob
 import tqdm
+from typing import List
+from transformers import PreTrainedTokenizer
 
 
 logger = logging.getLogger(__name__)
@@ -34,13 +36,13 @@ logger = logging.getLogger(__name__)
 class InputExample(object):
     """A single training/test example for multiple choice"""
 
-    def __init__(self, example_id, question,  contexts, endings, label=None):
+    def __init__(self, example_id, question, contexts, endings, label=None):
         """Constructs a InputExample.
 
         Args:
             example_id: Unique id for the example.
             contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-            question: string. The untokenized text of the second sequence (qustion).
+            question: string. The untokenized text of the second sequence (question).
             endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
             label: (Optional) string. The label of the example. This should be
             specified for train and dev examples, but not for test examples.
@@ -66,7 +68,7 @@ class InputFeatures(object):
                 'input_mask': input_mask,
                 'segment_ids': segment_ids
             }
-            for _, input_ids, input_mask, segment_ids in choices_features
+            for input_ids, input_mask, segment_ids in choices_features
         ]
         self.label = label
 
@@ -192,7 +194,7 @@ class SwagProcessor(DataProcessor):
             return lines
 
 
-    def _create_examples(self, lines, type):
+    def _create_examples(self, lines: List[List[str]], type: str):
         """Creates examples for the training and dev sets."""
         if type == "train" and lines[0][-1] != 'label':
             raise ValueError(
@@ -300,24 +302,18 @@ class ArcProcessor(DataProcessor):
         return examples
 
 
-def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer,
-                                 cls_token_at_end=False,
-                                 cls_token='[CLS]',
-                                 cls_token_segment_id=1,
-                                 sep_token='[SEP]',
-                                 sequence_a_segment_id=0,
-                                 sequence_b_segment_id=1,
-                                 sep_token_extra=False,
-                                 pad_token_segment_id=0,
-                                 pad_on_left=False,
-                                 pad_token=0,
-                                 mask_padding_with_zero=True):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+def convert_examples_to_features(
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+    pad_token_segment_id=0,
+    pad_on_left=False,
+    pad_token=0,
+    mask_padding_with_zero=True,
+) -> List[InputFeatures]:
+    """
+    Loads a data file into a list of `InputFeatures`
     """
 
     label_map = {label : i for i, label in enumerate(label_list)}
@@ -328,125 +324,71 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
         choices_features = []
         for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
-            tokens_a = tokenizer.tokenize(context)
-            tokens_b = None
+            text_a = context
             if example.question.find("_") != -1:
-                #this is for cloze question
-                tokens_b = tokenizer.tokenize(example.question.replace("_", ending))
+                # this is for cloze question
+                text_b = example.question.replace("_", ending)
             else:
-                tokens_b = tokenizer.tokenize(example.question + " " + ending)
-                # you can add seq token between quesiotn and ending. This does not make too much difference.
-                # tokens_b = tokenizer.tokenize(example.question)
-                # tokens_b += [sep_token]
-                # if sep_token_extra:
-                #     tokens_b += [sep_token]
-                # tokens_b += tokenizer.tokenize(ending)
+                text_b = example.question + " " + ending
 
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
+            inputs = tokenizer.encode_plus(
+                text_a,
+                text_b,
+                add_special_tokens=True,
+                max_length=max_length,
+                truncate_both_sequences=True
+            )
+            if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0:
+                logger.info('Attention! you are cropping tokens (swag task is ok). '
+                        'If you are training ARC and RACE and you are poping question + options,'
+                        'you need to try to use a bigger max seq length!')
 
-            # The convention in BERT is:
-            # (a) For sequence pairs:
-            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-            # (b) For single sequences:
-            #  tokens:   [CLS] the dog is hairy . [SEP]
-            #  type_ids:   0   0   0   0  0     0   0
-            #
-            # Where "type_ids" are used to indicate whether this is the first
-            # sequence or the second sequence. The embedding vectors for `type=0` and
-            # `type=1` were learned during pre-training and are added to the wordpiece
-            # embedding vector (and position vector). This is not *strictly* necessary
-            # since the [SEP] token unambiguously separates the sequences, but it makes
-            # it easier for the model to learn the concept of sequences.
-            #
-            # For classification tasks, the first vector (corresponding to [CLS]) is
-            # used as as the "sentence vector". Note that this only makes sense because
-            # the entire model is fine-tuned.
-            tokens = tokens_a + [sep_token]
-            if sep_token_extra:
-                # roberta uses an extra separator b/w pairs of sentences
-                tokens += [sep_token]
-
-            segment_ids = [sequence_a_segment_id] * len(tokens)
-
-            if tokens_b:
-                tokens += tokens_b + [sep_token]
-                segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-
-            if cls_token_at_end:
-                tokens = tokens + [cls_token]
-                segment_ids = segment_ids + [cls_token_segment_id]
-            else:
-                tokens = [cls_token] + tokens
-                segment_ids = [cls_token_segment_id] + segment_ids
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
 
             # The mask has 1 for real tokens and 0 for padding tokens. Only real
             # tokens are attended to.
-            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
 
             # Zero-pad up to the sequence length.
-            padding_length = max_seq_length - len(input_ids)
+            padding_length = max_length - len(input_ids)
             if pad_on_left:
                 input_ids = ([pad_token] * padding_length) + input_ids
-                input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-                segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
             else:
                 input_ids = input_ids + ([pad_token] * padding_length)
-                input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-                segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+            assert len(input_ids) == max_length
+            assert len(attention_mask) == max_length
+            assert len(token_type_ids) == max_length
+            choices_features.append((input_ids, attention_mask, token_type_ids))
+
 
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-            choices_features.append((tokens, input_ids, input_mask, segment_ids))
         label = label_map[example.label]
 
         if ex_index < 2:
             logger.info("*** Example ***")
             logger.info("race_id: {}".format(example.example_id))
-            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
+            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                 logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
                 logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+                logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
+                logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
                 logger.info("label: {}".format(label))
 
         features.append(
             InputFeatures(
-                example_id = example.example_id,
-                choices_features = choices_features,
-                label = label
+                example_id=example.example_id,
+                choices_features=choices_features,
+                label=label,
             )
         )
 
     return features
 
 
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-
-    # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
-    # length or only pop from context
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            logger.info('Attention! you are removing from token_b (swag task is ok). '
-                        'If you are training ARC and RACE (you are poping question + options), '
-                        'you need to try to use a bigger max seq length!')
-            tokens_b.pop()
 
 
 processors = {
@@ -456,7 +398,7 @@ processors = {
 }
 
 
-GLUE_TASKS_NUM_LABELS = {
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
     "race", 4,
     "swag", 4,
     "arc", 4
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 1e20588f83..ea90f33a28 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -699,6 +699,7 @@ class PreTrainedTokenizer(object):
                 max_length=None,
                 stride=0,
                 truncate_first_sequence=True,
+                truncate_both_sequences=False,
                 return_tensors=None,
                 **kwargs):
         """
@@ -718,7 +719,7 @@ class PreTrainedTokenizer(object):
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defined the number of additional tokens.
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
@@ -731,6 +732,7 @@ class PreTrainedTokenizer(object):
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
                                           truncate_first_sequence=truncate_first_sequence,
+                                          truncate_both_sequences=truncate_both_sequences,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -743,6 +745,7 @@ class PreTrainedTokenizer(object):
                     max_length=None,
                     stride=0,
                     truncate_first_sequence=True,
+                    truncate_both_sequences=False,
                     return_tensors=None,
                     **kwargs):
         """
@@ -761,7 +764,7 @@ class PreTrainedTokenizer(object):
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defined the number of additional tokens.
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
             truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
@@ -788,11 +791,12 @@ class PreTrainedTokenizer(object):
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncate_first_sequence=truncate_first_sequence,
+                                      truncate_both_sequences=truncate_both_sequences,
                                       return_tensors=return_tensors)
 
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
-                          truncate_first_sequence=True, return_tensors=None):
+                          truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
         It adds special tokens, truncates
@@ -825,21 +829,30 @@ class PreTrainedTokenizer(object):
         encoded_inputs = {}
         if max_length:
             n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
-            if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
-                logger.warning(
-                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
-                    "This pair of sequences will not be truncated.")
-            else:
-                if n_added_tokens + len_ids + len_pair_ids > max_length:
-                    if truncate_first_sequence or not pair:
-                        encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
-                        ids = ids[:max_length - len_pair_ids - n_added_tokens]
-                    elif not truncate_first_sequence and pair:
-                        encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
-                        pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
-                    else:
-                        logger.warning(
-                            "Cannot truncate second sequence as it is not provided. No truncation.")
+
+            if n_added_tokens + len_ids + len_pair_ids > max_length:
+                if truncate_both_sequences:
+                    tokens_a, tokens_b = self._truncate_seq_pair(
+                        copy.deepcopy(ids),
+                        copy.deepcopy(pair_ids),
+                        max_length=max_length - n_added_tokens
+                    )
+                    encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
+                    ids = tokens_a
+                    pair_ids = tokens_b
+                elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
+                    logger.warning(
+                        "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
+                        "This pair of sequences will not be truncated.")
+                elif truncate_first_sequence or not pair:
+                    encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
+                    ids = ids[:max_length - len_pair_ids - n_added_tokens]
+                elif not truncate_first_sequence and pair:
+                    encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
+                    pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
+                else:
+                    logger.warning(
+                        "Cannot truncate second sequence as it is not provided. No truncation.")
 
         if add_special_tokens:
             sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
@@ -862,6 +875,25 @@ class PreTrainedTokenizer(object):
 
         return encoded_inputs
 
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+
+        # This is a simple heuristic which will always truncate the longer sequence
+        # one token at a time. This makes more sense than truncating an equal percent
+        # of tokens from each, since if one sequence is very short then each token
+        # that's truncated likely contains more information than a longer sequence.
+
+        # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
+        # length or only pop from context
+        while True:
+            total_length = len(tokens_a) + len(tokens_b)
+            if total_length <= max_length:
+                return (tokens_a, tokens_b)
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         logger.warning("This tokenizer does not make use of special tokens.")
         return [0] * len(token_ids_0) + [1] * len(token_ids_1)