updating GLUE utils for compatibility with XLNet

2019-06-24 14:36:11 +02:00
parent 24ed0b9346
commit 62d78aa37e
9 changed files with 310 additions and 98 deletions
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -388,8 +388,15 @@ class WnliProcessor(DataProcessor):


 def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer, output_mode):
-    """Loads a data file into a list of `InputBatch`s."""
+                                 tokenizer, output_mode,
+                                 cls_token_at_end=False, cls_token='[CLS]',
+                                 sep_token='[SEP]', cls_token_segment_id=0):
+    """ Loads a data file into a list of `InputBatch`s
+        `cls_token_at_end` defines the location of the CLS token:
+            - False (BERT pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet)
+    """

    label_map = {label : i for i, label in enumerate(label_list)}

@@ -415,10 +422,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
+        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
+        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
@@ -430,13 +437,20 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        tokens = tokens_a + [sep_token]
        segment_ids = [0] * len(tokens)

        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
+            tokens += tokens_b + [sep_token]
            segment_ids += [1] * (len(tokens_b) + 1)

+        if cls_token_at_end:
+            tokens = tokens + [cls_token]
+            segment_ids = segment_ids + [cls_token_segment_id]
+        else:
+            tokens = [cls_token] + tokens
+            segment_ids = [cls_token_segment_id] + segment_ids
+
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real