From bac332fec06c843c8c3436091bb53343c109202d Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 2 Sep 2019 21:46:11 -0400
Subject: [PATCH] Updated the GLUE data processor. Corrections to RoBERTa and
 XLNet.

---
 examples/utils_glue.py                       | 55 +-------------------
 pytorch_transformers/tokenization_roberta.py |  2 +-
 pytorch_transformers/tokenization_xlnet.py   |  3 +-
 3 files changed, 4 insertions(+), 56 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 3e3f104672..841d7376cf 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index % 10000 == 0:
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
 
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-            special_tokens_count = 3 if sep_token_extra else 2
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = tokens_a + [sep_token]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if tokens_b:
-            tokens += tokens_b + [sep_token]
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-
-        if cls_token_at_end:
-            tokens = tokens + [cls_token]
-            segment_ids = segment_ids + [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
 
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index < 5:
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join(
-                    [str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
             logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index db4cbe967f..b41d85e7e2 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         if output_mask:
             return (
                 cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-                [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep)
+                [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)
             )
         else:
             return cls + token_ids_0 + sep + sep + token_ids_1 + sep
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index cbf312bd38..9faa3c7e59 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
+        cls_segment_ids = [2]
         if output_mask:
             return (
                 token_ids_0 + sep + token_ids_1 + sep + cls,
-                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls)
+                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids
             )
         else:
             return token_ids_0 + sep + token_ids_1 + sep + cls