Updated the GLUE data processor. Corrections to RoBERTa and XLNet.

2019-09-02 21:46:11 -04:00
parent c3df2136e1
commit bac332fec0
3 changed files with 4 additions and 56 deletions
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        tokens_a = tokenizer.tokenize(example.text_a)
+        input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3", or with "- 4" for RoBERTa.
            special_tokens_count = 4 if sep_token_extra else 3
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
        else:
            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
            special_tokens_count = 3 if sep_token_extra else 2
            if len(tokens_a) > max_seq_length - special_tokens_count:
                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)
        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
        if output_mask:
            return (
                cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-                [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep)
+                [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)
            )
        else:
            return cls + token_ids_0 + sep + sep + token_ids_1 + sep
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        cls_segment_ids = [2]
        if output_mask:
            return (
                token_ids_0 + sep + token_ids_1 + sep + cls,
-                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls)
+                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids
            )
        else:
            return token_ids_0 + sep + token_ids_1 + sep + cls