Updated the GLUE data processor. Corrections to RoBERTa and XLNet.
This commit is contained in:
@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
||||
if ex_index % 10000 == 0:
|
||||
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
||||
|
||||
tokens_a = tokenizer.tokenize(example.text_a)
|
||||
|
||||
tokens_b = None
|
||||
if example.text_b:
|
||||
tokens_b = tokenizer.tokenize(example.text_b)
|
||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
||||
# length is less than the specified length.
|
||||
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
|
||||
special_tokens_count = 4 if sep_token_extra else 3
|
||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
|
||||
else:
|
||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
||||
special_tokens_count = 3 if sep_token_extra else 2
|
||||
if len(tokens_a) > max_seq_length - special_tokens_count:
|
||||
tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
|
||||
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# (b) For single sequences:
|
||||
# tokens: [CLS] the dog is hairy . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
#
|
||||
# Where "type_ids" are used to indicate whether this is the first
|
||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
||||
# embedding vector (and position vector). This is not *strictly* necessary
|
||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
||||
# it easier for the model to learn the concept of sequences.
|
||||
#
|
||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||
# used as as the "sentence vector". Note that this only makes sense because
|
||||
# the entire model is fine-tuned.
|
||||
tokens = tokens_a + [sep_token]
|
||||
if sep_token_extra:
|
||||
# roberta uses an extra separator b/w pairs of sentences
|
||||
tokens += [sep_token]
|
||||
segment_ids = [sequence_a_segment_id] * len(tokens)
|
||||
|
||||
if tokens_b:
|
||||
tokens += tokens_b + [sep_token]
|
||||
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
|
||||
|
||||
if cls_token_at_end:
|
||||
tokens = tokens + [cls_token]
|
||||
segment_ids = segment_ids + [cls_token_segment_id]
|
||||
else:
|
||||
tokens = [cls_token] + tokens
|
||||
segment_ids = [cls_token_segment_id] + segment_ids
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
|
||||
|
||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||
# tokens are attended to.
|
||||
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
||||
if ex_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("guid: %s" % (example.guid))
|
||||
logger.info("tokens: %s" % " ".join(
|
||||
[str(x) for x in tokens]))
|
||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
||||
|
||||
Reference in New Issue
Block a user