From 403d3098572ac308416653648456a940860da39e Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Sat, 13 Jun 2020 09:35:13 -0400 Subject: [PATCH] Hans data (#4854) * Update hans data to be able to use Trainer * Fixes * Deal with tokenizer that don't have token_ids * Clean up things * Simplify data use * Fix the input dict * Formatting + proper path in README --- examples/adversarial/README.md | 2 +- examples/adversarial/hans_processors.py | 221 -------------- examples/adversarial/test_hans.py | 127 +++----- examples/adversarial/utils_hans.py | 371 +++++++++++++++++++----- 4 files changed, 339 insertions(+), 382 deletions(-) delete mode 100644 examples/adversarial/hans_processors.py diff --git a/examples/adversarial/README.md b/examples/adversarial/README.md index 824867fd26..5d50c84734 100644 --- a/examples/adversarial/README.md +++ b/examples/adversarial/README.md @@ -11,7 +11,7 @@ export HANS_DIR=path-to-hans export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py -python examples/hans/test_hans.py \ +python examples/adversarial/test_hans.py \ --task_name hans \ --model_type $MODEL_TYPE \ --do_eval \ diff --git a/examples/adversarial/hans_processors.py b/examples/adversarial/hans_processors.py deleted file mode 100644 index ff75a0acd1..0000000000 --- a/examples/adversarial/hans_processors.py +++ /dev/null @@ -1,221 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" GLUE processors and helpers """ - -import logging -import os - -from transformers.file_utils import is_tf_available -from utils_hans import DataProcessor, InputExample, InputFeatures - - -if is_tf_available(): - import tensorflow as tf - -logger = logging.getLogger(__name__) - - -def hans_convert_examples_to_features( - examples, - tokenizer, - max_length=512, - task=None, - label_list=None, - output_mode=None, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True, -): - """ - Loads a data file into a list of ``InputFeatures`` - - Args: - examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. - tokenizer: Instance of a tokenizer that will tokenize the examples - max_length: Maximum example length - task: HANS - label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method - output_mode: String indicating the output mode. Either ``regression`` or ``classification`` - pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) - pad_token: Padding token - pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) - mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values - and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for - actual values) - - Returns: - If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` - containing the task-specific features. If the input is a list of ``InputExamples``, will return - a list of task-specific ``InputFeatures`` which can be fed to the model. - - """ - is_tf_dataset = False - if is_tf_available() and isinstance(examples, tf.data.Dataset): - is_tf_dataset = True - - if task is not None: - processor = glue_processors[task]() - if label_list is None: - label_list = processor.get_labels() - logger.info("Using label list %s for task %s" % (label_list, task)) - if output_mode is None: - output_mode = glue_output_modes[task] - logger.info("Using output mode %s for task %s" % (output_mode, task)) - - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d" % (ex_index)) - if is_tf_dataset: - example = processor.get_example_from_tensor_dict(example) - example = processor.tfds_map(example) - - inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,) - input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask - token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format( - len(attention_mask), max_length - ) - assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format( - len(token_type_ids), max_length - ) - - if output_mode == "classification": - label = label_map[example.label] if example.label in label_map else 0 - elif output_mode == "regression": - label = float(example.label) - else: - raise KeyError(output_mode) - pairID = str(example.pairID) - - if ex_index < 10: - logger.info("*** Example ***") - logger.info("text_a: %s" % (example.text_a)) - logger.info("text_b: %s" % (example.text_b)) - logger.info("guid: %s" % (example.guid)) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) - logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) - logger.info("label: %s (id = %d)" % (example.label, label)) - - features.append( - InputFeatures( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - label=label, - pairID=pairID, - ) - ) - - if is_tf_available() and is_tf_dataset: - - def gen(): - for ex in features: - yield ( - { - "input_ids": ex.input_ids, - "attention_mask": ex.attention_mask, - "token_type_ids": ex.token_type_ids, - }, - ex.label, - ) - - return tf.data.Dataset.from_generator( - gen, - ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), - ( - { - "input_ids": tf.TensorShape([None]), - "attention_mask": tf.TensorShape([None]), - "token_type_ids": tf.TensorShape([None]), - }, - tf.TensorShape([]), - ), - ) - - return features - - -class HansProcessor(DataProcessor): - """Processor for the HANS data set.""" - - def get_example_from_tensor_dict(self, tensor_dict): - """See base class.""" - return InputExample( - tensor_dict["idx"].numpy(), - tensor_dict["premise"].numpy().decode("utf-8"), - tensor_dict["hypothesis"].numpy().decode("utf-8"), - str(tensor_dict["label"].numpy()), - ) - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[5] - text_b = line[6] - pairID = line[7][2:] if line[7].startswith("ex") else line[7] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID)) - return examples - - -glue_tasks_num_labels = { - "hans": 3, -} - -glue_processors = { - "hans": HansProcessor, -} - -glue_output_modes = { - "hans": "classification", -} diff --git a/examples/adversarial/test_hans.py b/examples/adversarial/test_hans.py index d22fbc8122..3d8cf08598 100644 --- a/examples/adversarial/test_hans.py +++ b/examples/adversarial/test_hans.py @@ -25,13 +25,10 @@ import random import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from hans_processors import glue_output_modes as output_modes -from hans_processors import glue_processors as processors -from hans_processors import hans_convert_examples_to_features as convert_examples_to_features from transformers import ( WEIGHTS_NAME, AdamW, @@ -41,6 +38,7 @@ from transformers import ( BertConfig, BertForSequenceClassification, BertTokenizer, + DefaultDataCollator, DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer, @@ -55,6 +53,7 @@ from transformers import ( XLNetTokenizer, get_linear_schedule_with_warmup, ) +from utils_hans import HansDataset, hans_output_modes, hans_processors try: @@ -91,7 +90,12 @@ def train(args, train_dataset, model, tokenizer): args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + train_dataloader = DataLoader( + train_dataset, + sampler=train_sampler, + batch_size=args.train_batch_size, + collate_fn=DefaultDataCollator().collate_batch, + ) if args.max_steps > 0: t_total = args.max_steps @@ -153,12 +157,7 @@ def train(args, train_dataset, model, tokenizer): epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() - batch = tuple(t.to(args.device) for t in batch) - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if args.model_type != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if args.model_type in ["bert", "xlnet"] else None - ) # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"} outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) @@ -230,14 +229,21 @@ def train(args, train_dataset, model, tokenizer): return global_step, tr_loss / global_step -def evaluate(args, model, tokenizer, prefix=""): +def evaluate(args, model, tokenizer, label_list, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) + eval_dataset = HansDataset( + args.data_dir, + tokenizer, + args.task_name, + args.max_seq_length, + overwrite_cache=args.overwrite_cache, + evaluate=True, + ) if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) @@ -245,7 +251,12 @@ def evaluate(args, model, tokenizer, prefix=""): args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + eval_dataloader = DataLoader( + eval_dataset, + sampler=eval_sampler, + batch_size=args.eval_batch_size, + collate_fn=DefaultDataCollator().collate_batch, + ) # multi-gpu eval if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): @@ -261,14 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""): out_label_ids = None for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() - batch = tuple(t.to(args.device) for t in batch) - + inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"} + pair_ids = batch.pop("pairID", None) with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if args.model_type != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if args.model_type in ["bert", "xlnet"] else None - ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -277,11 +283,11 @@ def evaluate(args, model, tokenizer, prefix=""): if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() - pair_ids = batch[4].detach().cpu().numpy() + pair_ids = pair_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) - pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0) + pair_ids = np.append(pair_ids, pair_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": @@ -298,67 +304,6 @@ def evaluate(args, model, tokenizer, prefix=""): return results -def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join( - args.data_dir, - "cached_{}_{}_{}_{}".format( - "dev" if evaluate else "train", - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length), - str(task), - ), - ) - - label_list = processor.get_labels() - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]: - # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = ( - processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - ) - features = convert_examples_to_features( - examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.pad_token_id, - pad_token_segment_id=tokenizer.pad_token_type_id, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - if output_mode == "classification": - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - elif output_mode == "regression": - all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids) - return dataset, label_list - - def main(): parser = argparse.ArgumentParser() @@ -389,7 +334,7 @@ def main(): default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + help="The name of the task to train selected in the list: " + ", ".join(hans_processors.keys()), ) parser.add_argument( "--output_dir", @@ -541,10 +486,10 @@ def main(): # Prepare GLUE task args.task_name = args.task_name.lower() - if args.task_name not in processors: + if args.task_name not in hans_processors: raise ValueError("Task not found: %s" % (args.task_name)) - processor = processors[args.task_name]() - args.output_mode = output_modes[args.task_name] + processor = hans_processors[args.task_name]() + args.output_mode = hans_output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) @@ -581,7 +526,9 @@ def main(): # Training if args.do_train: - train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) + train_dataset = HansDataset( + args.data_dir, tokenizer, args.task_name, args.max_seq_length, overwrite_cache=args.overwrite_cache + ) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) @@ -625,7 +572,7 @@ def main(): model = model_class.from_pretrained(checkpoint) model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=prefix) + result = evaluate(args, model, tokenizer, label_list, prefix=prefix) result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) diff --git a/examples/adversarial/utils_hans.py b/examples/adversarial/utils_hans.py index 8d0b42165c..d99e1d8ecb 100644 --- a/examples/adversarial/utils_hans.py +++ b/examples/adversarial/utils_hans.py @@ -14,108 +14,339 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import csv -import json +import logging +import os +from dataclasses import dataclass +from typing import List, Optional, Union + +import tqdm +from filelock import FileLock + +from transformers import ( + DataProcessor, + PreTrainedTokenizer, + RobertaTokenizer, + RobertaTokenizerFast, + XLMRobertaTokenizer, + is_tf_available, + is_torch_available, +) -class InputExample(object): +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class InputExample: """ A single training/test example for simple sequence classification. Args: guid: Unique id for the example. text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. + sequence tasks, only this sequence must be specified. text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. + Only must be specified for sequence pair tasks. label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. + specified for train and dev examples, but not for test examples. + pairID: (Optional) string. Unique identifier for the pair of sentences. """ - def __init__(self, guid, text_a, text_b=None, label=None, pairID=None): - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - self.pairID = pairID - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + guid: str + text_a: str + text_b: Optional[str] = None + label: Optional[str] = None + pairID: Optional[str] = None -class InputFeatures(object): +@dataclass(frozen=True) +class InputFeatures: """ A single set of features of data. + Property names are the same names as the corresponding inputs to a model. Args: input_ids: Indices of input sequence tokens in the vocabulary. attention_mask: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. - token_type_ids: Segment token indices to indicate first and second portions of the inputs. - label: Label corresponding to the input + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + pairID: (Optional) Unique identifier for the pair of sentences. """ - def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None): - self.input_ids = input_ids - self.attention_mask = attention_mask - self.token_type_ids = token_type_ids - self.label = label - self.pairID = pairID - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + pairID: Optional[int] = None -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" +if is_torch_available(): + import torch + from torch.utils.data.dataset import Dataset + + class HansDataset(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + features: List[InputFeatures] + + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + task: str, + max_seq_length: Optional[int] = None, + overwrite_cache=False, + evaluate: bool = False, + ): + processor = hans_processors[task]() + output_mode = hans_output_modes[task] + + cached_features_file = os.path.join( + data_dir, + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task, + ), + ) + + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. + lock_path = cached_features_file + ".lock" + with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not overwrite_cache: + logger.info(f"Loading features from cached file {cached_features_file}") + self.features = torch.load(cached_features_file) + else: + logger.info(f"Creating features from dataset file at {data_dir}") + label_list = processor.get_labels() + + if task in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( + RobertaTokenizer, + RobertaTokenizerFast, + XLMRobertaTokenizer, + ): + # HACK(label indices are swapped in RoBERTa pretrained model) + label_list[1], label_list[2] = label_list[2], label_list[1] + examples = ( + processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + ) + + logger.info("Training examples: %s", len(examples)) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = hans_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_mode + ) + logger.info("Saving features into cached file %s", cached_features_file) + torch.save(self.features, cached_features_file) + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +if is_tf_available(): + import tensorflow as tf + + class TFHansDataset: + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + features: List[InputFeatures] + + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + task: str, + max_seq_length: Optional[int] = 128, + overwrite_cache=False, + evaluate: bool = False, + ): + processor = hans_processors[task]() + output_mode = hans_output_modes[task] + label_list = processor.get_labels() + + if task in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( + RobertaTokenizer, + RobertaTokenizerFast, + XLMRobertaTokenizer, + ): + # HACK(label indices are swapped in RoBERTa pretrained model) + label_list[1], label_list[2] = label_list[2], label_list[1] + + examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = hans_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_mode + ) + + def gen(): + for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + yield ( + { + "example_id": 0, + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + ex.label, + ) + + self.dataset = tf.data.Dataset.from_generator( + gen, + ( + { + "example_id": tf.int32, + "input_ids": tf.int32, + "attention_mask": tf.int32, + "token_type_ids": tf.int32, + }, + tf.int64, + ), + ( + { + "example_id": tf.TensorShape([]), + "input_ids": tf.TensorShape([None, None]), + "attention_mask": tf.TensorShape([None, None]), + "token_type_ids": tf.TensorShape([None, None]), + }, + tf.TensorShape([]), + ), + ) + + def get_dataset(self): + return self.dataset + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +class HansProcessor(DataProcessor): + """Processor for the HANS data set.""" def get_example_from_tensor_dict(self, tensor_dict): - """Gets an example from a dict with tensorflow tensors - - Args: - tensor_dict: Keys and values should match the corresponding Glue - tensorflow_dataset examples. - """ - raise NotImplementedError() + """See base class.""" + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["premise"].numpy().decode("utf-8"), + tensor_dict["hypothesis"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train") def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev") def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() + """See base class.""" + return ["contradiction", "entailment", "neutral"] - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[5] + text_b = line[6] + pairID = line[7][2:] if line[7].startswith("ex") else line[7] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID)) + return examples + + +def hans_convert_examples_to_features( + examples: List[InputExample], + label_list: List[str], + max_length: int, + tokenizer: PreTrainedTokenizer, + output_mode: str, +): + """ + Loads a data file into a list of ``InputFeatures`` + + Args: + examples: List of ``InputExamples`` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples. + max_length: Maximum example length. + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method. + output_mode: String indicating the output mode. Either ``regression`` or ``classification``. + + Returns: + A list of task-specific ``InputFeatures`` which can be fed to the model. + + """ + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + pad_to_max_length=True, + return_overflowing_tokens=True, + ) + if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: + logger.info( + "Attention! you are cropping tokens (swag task is ok). " + "If you are training ARC and RACE and you are poping question + options," + "you need to try to use a bigger max seq length!" + ) + + if output_mode == "classification": + label = label_map[example.label] if example.label in label_map else 0 + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + pairID = int(example.pairID) + + features.append(InputFeatures(**inputs, label=label, pairID=pairID)) + + for i, example in enumerate(examples[:5]): + logger.info("*** Example ***") + logger.info(f"guid: {example}") + logger.info(f"features: {features[i]}") + + return features + + +hans_tasks_num_labels = { + "hans": 3, +} + +hans_processors = { + "hans": HansProcessor, +} + +hans_output_modes = { + "hans": "classification", +}