Hans data (#4854)

* Update hans data to be able to use Trainer
* Fixes
* Deal with tokenizers that don't have token_ids
* Clean up things
* Simplify data use
* Fix the input dict
* Formatting + proper path in README
2020-06-13 09:35:13 -04:00
parent ca5e1cdf8e
commit 403d309857
4 changed files with 339 additions and 382 deletions
--- a/examples/adversarial/README.md
+++ b/examples/adversarial/README.md
@@ -11,7 +11,7 @@ export HANS_DIR=path-to-hans
 export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
 export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
-python examples/hans/test_hans.py \
+python examples/adversarial/test_hans.py \
        --task_name hans \
        --model_type $MODEL_TYPE \
        --do_eval \
--- a/examples/adversarial/hans_processors.py
+++ b/examples/adversarial/hans_processors.py
@@ -1,221 +0,0 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ GLUE processors and helpers """
 import logging
 import os
 from transformers.file_utils import is_tf_available
 from utils_hans import DataProcessor, InputExample, InputFeatures
 if is_tf_available():
    import tensorflow as tf
 logger = logging.getLogger(__name__)
 def hans_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
 ):
    """
    Loads a data file into a list of ``InputFeatures``

    Each example is encoded with ``tokenizer.encode_plus`` and then padded by hand
    to exactly ``max_length`` entries.

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: Key into ``glue_processors`` / ``glue_output_modes`` (this module only registers ``"hans"``).
            Used to fill in ``label_list`` and ``output_mode`` when they are ``None``.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.
        The ``tf.data.Dataset`` variant yields ``(inputs_dict, label)`` pairs and does not carry ``pairID``.

    Raises:
        KeyError: If ``output_mode`` is neither ``"classification"`` nor ``"regression"``.
        AssertionError: If a padded sequence does not end up with exactly ``max_length`` entries.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True
    # NOTE(review): `processor` is only bound inside this branch, but the tf.data path in the
    # loop below uses it unconditionally; a tf.data.Dataset passed with task=None looks like it
    # would raise NameError -- confirm against callers.
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))
    # label_list must be non-None by this point (passed in, or derived from `task` above).
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            # Turn the tensor dict into an InputExample before tokenizing.
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
        # NOTE(review): assumes the encoding always contains "token_type_ids" -- confirm for
        # tokenizers that do not produce segment ids.
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        # Zero-pad up to the sequence length.
        # If the encoding were longer than max_length, padding_length would be negative and
        # `[x] * negative` is an empty list, so the asserts below are what catch that case.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length
        )
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length
        )
        if output_mode == "classification":
            # Labels missing from label_map silently fall back to class index 0.
            label = label_map[example.label] if example.label in label_map else 0
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)
        pairID = str(example.pairID)
        # Log the first few examples in full for manual inspection.
        if ex_index < 10:
            logger.info("*** Example ***")
            logger.info("text_a: %s" % (example.text_a))
            logger.info("text_b: %s" % (example.text_b))
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))
        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
                pairID=pairID,
            )
        )
    if is_tf_available() and is_tf_dataset:
        # Re-expose the already-built features as a tf.data.Dataset of (inputs, label) pairs.
        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )
        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )
    return features
 class HansProcessor(DataProcessor):
    """Reads the HANS heuristics TSV files and turns their rows into ``InputExample`` objects."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from a dict of tensors keyed by idx/premise/hypothesis/label."""
        idx = tensor_dict["idx"].numpy()
        premise = tensor_dict["premise"].numpy().decode("utf-8")
        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(idx, premise, hypothesis, label)

    def get_train_examples(self, data_dir):
        """Return the examples read from ``heuristics_train_set.txt`` inside ``data_dir``."""
        train_path = os.path.join(data_dir, "heuristics_train_set.txt")
        rows = self._read_tsv(train_path)
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """Return the examples read from ``heuristics_evaluation_set.txt`` inside ``data_dir``."""
        dev_path = os.path.join(data_dir, "heuristics_evaluation_set.txt")
        rows = self._read_tsv(dev_path)
        return self._create_examples(rows, "dev")

    def get_labels(self):
        """Return the label names in the order used to assign class indices."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Convert parsed TSV rows into examples, skipping the first (header) row."""
        collected = []
        for row_number, fields in enumerate(lines):
            if row_number == 0:
                # First row is skipped (treated as the header).
                continue
            raw_pair = fields[7]
            # Drop a leading "ex" from the pair identifier when present.
            pair_id = raw_pair[2:] if raw_pair.startswith("ex") else raw_pair
            collected.append(
                InputExample(
                    guid="%s-%s" % (set_type, fields[0]),
                    text_a=fields[5],
                    text_b=fields[6],
                    label=fields[-1],
                    pairID=pair_id,
                )
            )
        return collected
 # Task registries keyed by lower-case task name; only "hans" is registered here.
 # Number of target classes per task.
 glue_tasks_num_labels = {
    "hans": 3,
 }
 # Processor class to instantiate per task.
 glue_processors = {
    "hans": HansProcessor,
 }
 # Output mode per task ("classification" or "regression").
 glue_output_modes = {
    "hans": "classification",
 }
--- a/examples/adversarial/test_hans.py
+++ b/examples/adversarial/test_hans.py
@@ -25,13 +25,10 @@ import random
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 from hans_processors import glue_output_modes as output_modes
 from hans_processors import glue_processors as processors
 from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
 from transformers import (
    WEIGHTS_NAME,
    AdamW,
@@ -41,6 +38,7 @@ from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertTokenizer,
    DefaultDataCollator,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
@@ -55,6 +53,7 @@ from transformers import (
    XLNetTokenizer,
    get_linear_schedule_with_warmup,
 )
 from utils_hans import HansDataset, hans_output_modes, hans_processors
 try:
@@ -91,7 +90,12 @@ def train(args, train_dataset, model, tokenizer):
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=DefaultDataCollator().collate_batch,
    )
    if args.max_steps > 0:
        t_total = args.max_steps
@@ -153,12 +157,7 @@ def train(args, train_dataset, model, tokenizer):
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"}
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
@@ -230,14 +229,21 @@ def train(args, train_dataset, model, tokenizer):
    return global_step, tr_loss / global_step
-def evaluate(args, model, tokenizer, prefix=""):
+def evaluate(args, model, tokenizer, label_list, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+        eval_dataset = HansDataset(
            args.data_dir,
            tokenizer,
            args.task_name,
            args.max_seq_length,
            overwrite_cache=args.overwrite_cache,
            evaluate=True,
        )
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
@@ -245,7 +251,12 @@ def evaluate(args, model, tokenizer, prefix=""):
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+        eval_dataloader = DataLoader(
            eval_dataset,
            sampler=eval_sampler,
            batch_size=args.eval_batch_size,
            collate_fn=DefaultDataCollator().collate_batch,
        )
        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
@@ -261,14 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""):
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
-            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"}
-
+            pair_ids = batch.pop("pairID", None)
            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
@@ -277,11 +283,11 @@ def evaluate(args, model, tokenizer, prefix=""):
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
-                pair_ids = batch[4].detach().cpu().numpy()
+                pair_ids = pair_ids.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-                pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0)
+                pair_ids = np.append(pair_ids, pair_ids.detach().cpu().numpy(), axis=0)
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
@@ -298,67 +304,6 @@ def evaluate(args, model, tokenizer, prefix=""):
    return results
 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """
    Build (or load from an on-disk cache) the features for ``task`` and wrap them in a ``TensorDataset``.

    Args:
        args: Parsed command-line namespace; this function reads ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache`` and ``model_type``.
        task: Key into ``processors`` / ``output_modes``.
        tokenizer: Tokenizer passed on to ``convert_examples_to_features``.
        evaluate: If ``True`` use the dev examples, otherwise the train examples.

    Returns:
        A tuple ``(dataset, label_list)`` where ``dataset`` is a ``TensorDataset`` of
        (input_ids, attention_mask, token_type_ids, labels, pair_ids).
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    # The cache file name encodes split, last path component of the model name, max length and task.
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    label_list = processor.get_labels()
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE(review): torch.load unpickles the file; only safe for caches this script wrote itself.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        # NOTE(review): the swap below only runs on this (cache-miss) path, so the returned
        # label_list order differs between a cached and a fresh run for mnli + roberta -- confirm
        # whether callers rely on it.
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
        )
        # Only the main process (or a non-distributed run) writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    # all_labels is left unbound if output_mode is neither of the two values below.
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    # pairID is converted with int() so it can be stored in a long tensor.
    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
    return dataset, label_list
 def main():
    parser = argparse.ArgumentParser()
@@ -389,7 +334,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+        help="The name of the task to train selected in the list: " + ", ".join(hans_processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
@@ -541,10 +486,10 @@ def main():
    # Prepare GLUE task
    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
+    if args.task_name not in hans_processors:
        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
+    processor = hans_processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
+    args.output_mode = hans_output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
@@ -581,7 +526,9 @@ def main():
    # Training
    if args.do_train:
-        train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        train_dataset = HansDataset(
            args.data_dir, tokenizer, args.task_name, args.max_seq_length, overwrite_cache=args.overwrite_cache
        )
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
@@ -625,7 +572,7 @@ def main():
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = evaluate(args, model, tokenizer, label_list, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -14,108 +14,339 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
+import logging
-import csv
+import os
-import json
+from dataclasses import dataclass
 from typing import List, Optional, Union
 import tqdm
 from filelock import FileLock
 from transformers import (
    DataProcessor,
    PreTrainedTokenizer,
    RobertaTokenizer,
    RobertaTokenizerFast,
    XLMRobertaTokenizer,
    is_tf_available,
    is_torch_available,
 )
-class InputExample(object):
+logger = logging.getLogger(__name__)
@dataclass(frozen=True)
 class InputExample:
    """
    A single training/test example for simple sequence classification.
    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
+            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
+            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
+            specified for train and dev examples, but not for test examples.
        pairID: (Optional) string. Unique identifier for the pair of sentences.
    """
-    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
+    guid: str
-        self.guid = guid
+    text_a: str
-        self.text_a = text_a
+    text_b: Optional[str] = None
-        self.text_b = text_b
+    label: Optional[str] = None
-        self.label = label
+    pairID: Optional[str] = None
        self.pairID = pairID
    def __repr__(self):
        return str(self.to_json_string())
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output
    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-class InputFeatures(object):
+@dataclass(frozen=True)
 class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        token_type_ids: (Optional) Segment token indices to indicate first and second
-        label: Label corresponding to the input
+            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
        pairID: (Optional) Unique identifier for the pair of sentences.
    """
-    def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
+    input_ids: List[int]
-        self.input_ids = input_ids
+    attention_mask: Optional[List[int]] = None
-        self.attention_mask = attention_mask
+    token_type_ids: Optional[List[int]] = None
-        self.token_type_ids = token_type_ids
+    label: Optional[Union[int, float]] = None
-        self.label = label
+    pairID: Optional[int] = None
        self.pairID = pairID
    def __repr__(self):
        return str(self.to_json_string())
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output
    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-class DataProcessor(object):
+if is_torch_available():
-    """Base class for data converters for sequence classification data sets."""
+    import torch
    from torch.utils.data.dataset import Dataset
    class HansDataset(Dataset):
        """
        Map-style torch ``Dataset`` over a list of ``InputFeatures``, built from the
        task's data files or loaded from a cache file in ``data_dir``.

        This will be superseded by a framework-agnostic approach
        soon.
        """
        # Features for every example, in file order.
        features: List[InputFeatures]
        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            evaluate: bool = False,
        ):
            """
            Args:
                data_dir: Directory holding the data files; the feature cache is written here too.
                tokenizer: Tokenizer used to build the features; its class name is part of the cache file name.
                task: Key into ``hans_processors`` / ``hans_output_modes``.
                max_seq_length: Passed to ``hans_convert_examples_to_features`` as ``max_length``.
                overwrite_cache: If true, rebuild the features even when a cache file exists.
                evaluate: If true use the dev examples, otherwise the train examples.
            """
            processor = hans_processors[task]()
            output_mode = hans_output_modes[task]
            cached_features_file = os.path.join(
                data_dir,
                "cached_{}_{}_{}_{}".format(
                    "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
                ),
            )
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            with FileLock(lock_path):
                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    # NOTE(review): torch.load unpickles the file; only safe for caches this code wrote itself.
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {data_dir}")
                    label_list = processor.get_labels()
                    if task in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                        RobertaTokenizer,
                        RobertaTokenizerFast,
                        XLMRobertaTokenizer,
                    ):
                        # HACK(label indices are swapped in RoBERTa pretrained model)
                        label_list[1], label_list[2] = label_list[2], label_list[1]
                    examples = (
                        processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
                    )
                    # NOTE(review): this message says "Training" even when evaluate=True.
                    logger.info("Training examples: %s", len(examples))
                    # TODO clean up all this to leverage built-in features of tokenizers
                    self.features = hans_convert_examples_to_features(
                        examples, label_list, max_seq_length, tokenizer, output_mode
                    )
                    logger.info("Saving features into cached file %s", cached_features_file)
                    torch.save(self.features, cached_features_file)
        def __len__(self):
            """Return the number of examples."""
            return len(self.features)
        def __getitem__(self, i) -> InputFeatures:
            """Return the ``InputFeatures`` stored at position ``i``."""
            return self.features[i]
 if is_tf_available():
    import tensorflow as tf
    class TFHansDataset:
        """
        TensorFlow counterpart of the HANS dataset: builds all features eagerly and
        exposes them through a ``tf.data.Dataset`` (see ``get_dataset``).

        This will be superseded by a framework-agnostic approach
        soon.
        """

        # Features for every example, in file order.
        features: List[InputFeatures]

        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = 128,
            overwrite_cache=False,
            evaluate: bool = False,
        ):
            """
            Args:
                data_dir: Directory holding the data files.
                tokenizer: Tokenizer used to build the features.
                task: Key into ``hans_processors`` / ``hans_output_modes``.
                max_seq_length: Passed to ``hans_convert_examples_to_features`` as ``max_length``.
                overwrite_cache: Accepted for signature parity with the torch dataset; this
                    constructor does not read or write a cache.
                evaluate: If true use the dev examples, otherwise the train examples.
            """
            processor = hans_processors[task]()
            output_mode = hans_output_modes[task]
            label_list = processor.get_labels()
            if task in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                RobertaTokenizer,
                RobertaTokenizerFast,
                XLMRobertaTokenizer,
            ):
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]
            examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
            self.features = hans_convert_examples_to_features(
                examples, label_list, max_seq_length, tokenizer, output_mode
            )

            def gen():
                # Yields one (inputs_dict, label) pair per example.
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
                    if ex_index % 10000 == 0:
                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
                    yield (
                        {
                            "example_id": 0,
                            "input_ids": ex.input_ids,
                            "attention_mask": ex.attention_mask,
                            "token_type_ids": ex.token_type_ids,
                        },
                        ex.label,
                    )

            # Each example carries flat (1-D) token lists, so the declared shapes must be
            # rank 1. The previous rank-2 ``[None, None]`` made from_generator reject every
            # yielded element as shape-incompatible.
            # NOTE(review): token_type_ids is Optional on InputFeatures; a tokenizer that
            # returns none would yield None here -- confirm before using such a tokenizer.
            self.dataset = tf.data.Dataset.from_generator(
                gen,
                (
                    {
                        "example_id": tf.int32,
                        "input_ids": tf.int32,
                        "attention_mask": tf.int32,
                        "token_type_ids": tf.int32,
                    },
                    tf.int64,
                ),
                (
                    {
                        "example_id": tf.TensorShape([]),
                        "input_ids": tf.TensorShape([None]),
                        "attention_mask": tf.TensorShape([None]),
                        "token_type_ids": tf.TensorShape([None]),
                    },
                    tf.TensorShape([]),
                ),
            )

        def get_dataset(self):
            """Return the underlying ``tf.data.Dataset`` of (inputs, label) pairs."""
            return self.dataset

        def __len__(self):
            """Return the number of examples."""
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            """Return the ``InputFeatures`` stored at position ``i``."""
            return self.features[i]
 class HansProcessor(DataProcessor):
    """Processor for the HANS data set."""
    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
+        """See base class."""
-
+        return InputExample(
-        Args:
+            tensor_dict["idx"].numpy(),
-            tensor_dict: Keys and values should match the corresponding Glue
+            tensor_dict["premise"].numpy().decode("utf-8"),
-                tensorflow_dataset examples.
+            tensor_dict["hypothesis"].numpy().decode("utf-8"),
-        """
+            str(tensor_dict["label"].numpy()),
-        raise NotImplementedError()
+        )
    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
+        """See base class."""
-        raise NotImplementedError()
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
+        """See base class."""
-        raise NotImplementedError()
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
    def get_labels(self):
-        """Gets the list of labels for this data set."""
+        """See base class."""
-        raise NotImplementedError()
+        return ["contradiction", "entailment", "neutral"]
-    @classmethod
+    def _create_examples(self, lines, set_type):
-    def _read_tsv(cls, input_file, quotechar=None):
+        """Creates examples for the training and dev sets."""
-        """Reads a tab separated value file."""
+        examples = []
-        with open(input_file, "r", encoding="utf-8-sig") as f:
+        for (i, line) in enumerate(lines):
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            if i == 0:
-            lines = []
+                continue
-            for line in reader:
+            guid = "%s-%s" % (set_type, line[0])
-                lines.append(line)
+            text_a = line[5]
-            return lines
+            text_b = line[6]
            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
            label = line[-1]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
        return examples
 def hans_convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    output_mode: str,
 ):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` containing the examples.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
        max_length: Maximum example length.
        tokenizer: Instance of a tokenizer that will tokenize the examples.
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.

    Returns:
        A list of task-specific ``InputFeatures`` which can be fed to the model.

    Raises:
        KeyError: If ``output_mode`` is neither ``"classification"`` nor ``"regression"``.
    """
    label_map = {label: i for i, label in enumerate(label_list)}
    # Only these encoder outputs are fields of ``InputFeatures``. Anything else the tokenizer
    # returns (e.g. the overflow bookkeeping requested below) must not reach the constructor.
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")

    features = []
    for (ex_index, example) in tqdm.tqdm(
        enumerate(examples), total=len(examples), desc="convert examples to features"
    ):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_overflowing_tokens=True,
        )
        if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
            logger.info(
                "Attention! you are cropping tokens. "
                "If this happens often, you need to try to use a bigger max seq length!"
            )
        # Keep only what InputFeatures accepts; tokenizers without segment ids simply
        # leave token_type_ids at its default of None.
        model_inputs = {key: inputs[key] for key in feature_keys if key in inputs}

        if output_mode == "classification":
            # Labels missing from label_map fall back to class index 0.
            label = label_map.get(example.label, 0)
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        # pairID is optional on InputExample; only convert it when present.
        pairID = int(example.pairID) if example.pairID is not None else None

        features.append(InputFeatures(**model_inputs, label=label, pairID=pairID))

    # Log the first few examples together with their features for manual inspection.
    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info(f"guid: {example}")
        logger.info(f"features: {features[i]}")

    return features
 # Task registries keyed by lower-case task name; only "hans" is registered here.
 # Number of target classes per task.
 hans_tasks_num_labels = {
    "hans": 3,
 }
 # Processor class to instantiate per task.
 hans_processors = {
    "hans": HansProcessor,
 }
 # Output mode per task ("classification" or "regression").
 hans_output_modes = {
    "hans": "classification",
 }