Updated the GLUE data processor. Corrections to RoBERTa and XLNet.
This commit is contained in:
@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
|||||||
if ex_index % 10000 == 0:
|
if ex_index % 10000 == 0:
|
||||||
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
||||||
|
|
||||||
tokens_a = tokenizer.tokenize(example.text_a)
|
input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
|
||||||
|
|
||||||
tokens_b = None
|
|
||||||
if example.text_b:
|
|
||||||
tokens_b = tokenizer.tokenize(example.text_b)
|
|
||||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
|
||||||
# length is less than the specified length.
|
|
||||||
# Account for [CLS], [SEP], [SEP] with "- 3", or with "- 4" for RoBERTa.
|
|
||||||
special_tokens_count = 4 if sep_token_extra else 3
|
|
||||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
|
|
||||||
else:
|
|
||||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
|
||||||
special_tokens_count = 3 if sep_token_extra else 2
|
|
||||||
if len(tokens_a) > max_seq_length - special_tokens_count:
|
|
||||||
tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
|
|
||||||
|
|
||||||
# The convention in BERT is:
|
|
||||||
# (a) For sequence pairs:
|
|
||||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
|
||||||
# (b) For single sequences:
|
|
||||||
# tokens: [CLS] the dog is hairy . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0
|
|
||||||
#
|
|
||||||
# Where "type_ids" are used to indicate whether this is the first
|
|
||||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
|
||||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
|
||||||
# embedding vector (and position vector). This is not *strictly* necessary
|
|
||||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
|
||||||
# it easier for the model to learn the concept of sequences.
|
|
||||||
#
|
|
||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
|
||||||
# used as the "sentence vector". Note that this only makes sense because
|
|
||||||
# the entire model is fine-tuned.
|
|
||||||
tokens = tokens_a + [sep_token]
|
|
||||||
if sep_token_extra:
|
|
||||||
# RoBERTa uses an extra separator between pairs of sentences
|
|
||||||
tokens += [sep_token]
|
|
||||||
segment_ids = [sequence_a_segment_id] * len(tokens)
|
|
||||||
|
|
||||||
if tokens_b:
|
|
||||||
tokens += tokens_b + [sep_token]
|
|
||||||
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
|
|
||||||
|
|
||||||
if cls_token_at_end:
|
|
||||||
tokens = tokens + [cls_token]
|
|
||||||
segment_ids = segment_ids + [cls_token_segment_id]
|
|
||||||
else:
|
|
||||||
tokens = [cls_token] + tokens
|
|
||||||
segment_ids = [cls_token_segment_id] + segment_ids
|
|
||||||
|
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
||||||
|
|
||||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||||
# tokens are attended to.
|
# tokens are attended to.
|
||||||
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
|||||||
if ex_index < 5:
|
if ex_index < 5:
|
||||||
logger.info("*** Example ***")
|
logger.info("*** Example ***")
|
||||||
logger.info("guid: %s" % (example.guid))
|
logger.info("guid: %s" % (example.guid))
|
||||||
logger.info("tokens: %s" % " ".join(
|
|
||||||
[str(x) for x in tokens]))
|
|
||||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||||
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
if output_mask:
|
if output_mask:
|
||||||
return (
|
return (
|
||||||
cls + token_ids_0 + sep + sep + token_ids_1 + sep,
|
cls + token_ids_0 + sep + sep + token_ids_1 + sep,
|
||||||
[0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep)
|
[0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|||||||
@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
cls_segment_ids = [2]
|
||||||
if output_mask:
|
if output_mask:
|
||||||
return (
|
return (
|
||||||
token_ids_0 + sep + token_ids_1 + sep + cls,
|
token_ids_0 + sep + token_ids_1 + sep + cls,
|
||||||
[0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls)
|
[0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|||||||
Reference in New Issue
Block a user