From bac332fec06c843c8c3436091bb53343c109202d Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 2 Sep 2019 21:46:11 -0400 Subject: [PATCH] Updated the GLUE data processor. Corrections to RoBERTa and XLNet. --- examples/utils_glue.py | 55 +------------------- pytorch_transformers/tokenization_roberta.py | 2 +- pytorch_transformers/tokenization_xlnet.py | 3 +- 3 files changed, 4 insertions(+), 56 deletions(-) diff --git a/examples/utils_glue.py b/examples/utils_glue.py index 3e3f104672..841d7376cf 100644 --- a/examples/utils_glue.py +++ b/examples/utils_glue.py @@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, if ex_index % 10000 == 0: logger.info("Writing example %d of %d" % (ex_index, len(examples))) - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. - special_tokens_count = 4 if sep_token_extra else 3 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = 3 if sep_token_extra else 2 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = tokens_a + [sep_token] - if sep_token_extra: - # roberta uses an extra separator b/w pairs of sentences - tokens += [sep_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - if tokens_b: - tokens += tokens_b + [sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - if cls_token_at_end: - tokens = tokens + [cls_token] - segment_ids = segment_ids + [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. @@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, if ex_index < 5: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index db4cbe967f..b41d85e7e2 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer): if output_mask: return ( cls + token_ids_0 + sep + sep + token_ids_1 + sep, - [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep) + [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep) ) else: return cls + token_ids_0 + sep + sep + token_ids_1 + sep diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index cbf312bd38..9faa3c7e59 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer): sep = [self.sep_token_id] cls = [self.cls_token_id] + cls_segment_ids = [2] if output_mask: return ( token_ids_0 + sep + token_ids_1 + sep + cls, - [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls) + [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids ) else: return token_ids_0 + sep + token_ids_1 + sep + cls