[RoBERTa] Update run_glue for RoBERTa

2019-08-12 13:49:50 -04:00
parent b3d83d68db
commit 912fdff899
2 changed files with 22 additions and 8 deletions
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -390,10 +390,16 @@ class WnliProcessor(DataProcessor):

 def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode,
-                                 cls_token_at_end=False, pad_on_left=False,
-                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
-                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
-                                 cls_token_segment_id=1, pad_token_segment_id=0,
+                                 cls_token_at_end=False,
+                                 cls_token='[CLS]',
+                                 cls_token_segment_id=1,
+                                 sep_token='[SEP]',
+                                 sep_token_extra=False,
+                                 pad_on_left=False,
+                                 pad_token=0,
+                                 pad_token_segment_id=0,
+                                 sequence_a_segment_id=0, 
+                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` defines the location of the CLS token:
@@ -442,6 +448,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
+        if sep_token_extra:
+            # RoBERTa uses an extra separator between pairs of sentences
+            tokens += [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if tokens_b: