updating GLUE utils for compatibility with XLNet
This commit is contained in:
@@ -388,8 +388,15 @@ class WnliProcessor(DataProcessor):
|
||||
|
||||
|
||||
def convert_examples_to_features(examples, label_list, max_seq_length,
|
||||
tokenizer, output_mode):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
tokenizer, output_mode,
|
||||
cls_token_at_end=False, cls_token='[CLS]',
|
||||
sep_token='[SEP]', cls_token_segment_id=0):
|
||||
""" Loads a data file into a list of `InputBatch`s
|
||||
`cls_token_at_end` defines the location of the CLS token:
|
||||
- False (BERT pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
||||
`cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet)
|
||||
"""
|
||||
|
||||
label_map = {label : i for i, label in enumerate(label_list)}
|
||||
|
||||
@@ -415,10 +422,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# (b) For single sequences:
|
||||
# tokens: [CLS] the dog is hairy . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
#
|
||||
# Where "type_ids" are used to indicate whether this is the first
|
||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||
@@ -430,13 +437,20 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||
# used as the "sentence vector". Note that this only makes sense because
|
||||
# the entire model is fine-tuned.
|
||||
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
|
||||
tokens = tokens_a + [sep_token]
|
||||
segment_ids = [0] * len(tokens)
|
||||
|
||||
if tokens_b:
|
||||
tokens += tokens_b + ["[SEP]"]
|
||||
tokens += tokens_b + [sep_token]
|
||||
segment_ids += [1] * (len(tokens_b) + 1)
|
||||
|
||||
if cls_token_at_end:
|
||||
tokens = tokens + [cls_token]
|
||||
segment_ids = segment_ids + [cls_token_segment_id]
|
||||
else:
|
||||
tokens = [cls_token] + tokens
|
||||
segment_ids = [cls_token_segment_id] + segment_ids
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
|
||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||
|
||||
Reference in New Issue
Block a user