Add POS tagging and Phrase chunking token classification examples (#6457)

* Add more token classification examples * POS tagging example * Phrase chunking example * PR review fixes * Add conllu to third party list (used in token classification examples)
2020-08-13 12:09:51 -04:00
parent f51161e230
commit eda07efaa5
10 changed files with 473 additions and 204 deletions
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -15,3 +15,4 @@ pandas
 nlp
 fire
 pytest
 conllu
--- a/examples/token-classification/run.sh
+++ b/examples/token-classification/run.sh
@@ -18,6 +18,7 @@ export SAVE_STEPS=750
 export SEED=1
 python3 run_ner.py \
 --task_type NER \
 --data_dir . \
 --labels ./labels.txt \
 --model_name_or_path $BERT_MODEL \
--- a/examples/token-classification/run_chunk.sh
+++ b/examples/token-classification/run_chunk.sh
@@ -0,0 +1,37 @@
 if ! [ -f ./dev.txt ]; then
  echo "Downloading CONLL2003 dev dataset...."
  curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
 fi
 if ! [ -f ./test.txt ]; then
  echo "Downloading CONLL2003 test dataset...."
  curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
 fi
 if ! [ -f ./train.txt ]; then
  echo "Downloading CONLL2003 train dataset...."
  curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
 fi
 export MAX_LENGTH=200
 export BERT_MODEL=bert-base-uncased
 export OUTPUT_DIR=chunker-model
 export BATCH_SIZE=32
 export NUM_EPOCHS=3
 export SAVE_STEPS=750
 export SEED=1
 python3 run_ner.py \
 --task_type Chunk \
 --data_dir . \
 --model_name_or_path $BERT_MODEL \
 --output_dir $OUTPUT_DIR \
 --max_seq_length  $MAX_LENGTH \
 --num_train_epochs $NUM_EPOCHS \
 --per_gpu_train_batch_size $BATCH_SIZE \
 --save_steps $SAVE_STEPS \
 --seed $SEED \
 --do_train \
 --do_eval \
 --do_predict
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@@ -14,16 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Fine-tuning the library models for named entity recognition on CoNLL-2003. """
 import logging
 import os
 import sys
 from dataclasses import dataclass, field
 from importlib import import_module
 from typing import Dict, List, Optional, Tuple
 import numpy as np
-from seqeval.metrics import f1_score, precision_score, recall_score
+from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch import nn
 from transformers import (
@@ -36,7 +35,7 @@ from transformers import (
    TrainingArguments,
    set_seed,
 )
-from utils_ner import NerDataset, Split, get_labels
+from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
 logger = logging.getLogger(__name__)
@@ -54,6 +53,9 @@ class ModelArguments:
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    task_type: Optional[str] = field(
        default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
@@ -113,6 +115,16 @@ def main():
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )
    module = import_module("tasks")
    try:
        token_classification_task_clazz = getattr(module, model_args.task_type)
        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
    except AttributeError:
        raise ValueError(
            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
        )
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
@@ -133,7 +145,7 @@ def main():
    set_seed(training_args.seed)
    # Prepare CONLL-2003 task
-    labels = get_labels(data_args.labels)
+    labels = token_classification_task.get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)
@@ -164,7 +176,8 @@ def main():
    # Get datasets
    train_dataset = (
-        NerDataset(
+        TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
@@ -177,7 +190,8 @@ def main():
        else None
    )
    eval_dataset = (
-        NerDataset(
+        TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
@@ -209,6 +223,7 @@ def main():
    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            "accuracy_score": accuracy_score(out_label_list, preds_list),
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
@@ -253,7 +268,8 @@ def main():
    # Predict
    if training_args.do_predict:
-        test_dataset = NerDataset(
+        test_dataset = TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
@@ -278,19 +294,7 @@ def main():
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
-                    example_id = 0
+                    token_classification_task.write_predictions_to_file(writer, f, preds_list)
                    for line in f:
                        if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                            writer.write(line)
                            if not preds_list[example_id]:
                                example_id += 1
                        elif preds_list[example_id]:
                            output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                            writer.write(output_line)
                        else:
                            logger.warning(
                                "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                            )
    return results
--- a/examples/token-classification/run_pl_ner.py
+++ b/examples/token-classification/run_pl_ner.py
@@ -2,15 +2,17 @@ import argparse
 import glob
 import logging
 import os
 from argparse import Namespace
 from importlib import import_module
 import numpy as np
 import torch
-from seqeval.metrics import f1_score, precision_score, recall_score
+from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch.nn import CrossEntropyLoss
 from torch.utils.data import DataLoader, TensorDataset
 from lightning_base import BaseTransformer, add_generic_args, generic_train
-from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
+from utils_ner import TokenClassificationTask
 logger = logging.getLogger(__name__)
@@ -24,10 +26,20 @@ class NERTransformer(BaseTransformer):
    mode = "token-classification"
    def __init__(self, hparams):
-        self.labels = get_labels(hparams.labels)
+        if type(hparams) == dict:
-        num_labels = len(self.labels)
+            hparams = Namespace(**hparams)
        module = import_module("tasks")
        try:
            token_classification_task_clazz = getattr(module, hparams.task_type)
            self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
        except AttributeError:
            raise ValueError(
                f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
                f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
            )
        self.labels = self.token_classification_task.get_labels(hparams.labels)
        self.pad_token_label_id = CrossEntropyLoss().ignore_index
-        super().__init__(hparams, num_labels, self.mode)
+        super().__init__(hparams, len(self.labels), self.mode)
    def forward(self, **inputs):
        return self.model(**inputs)
@@ -42,8 +54,8 @@ class NERTransformer(BaseTransformer):
        outputs = self(**inputs)
        loss = outputs[0]
-        tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
+        # tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
-        return {"loss": loss, "log": tensorboard_logs}
+        return {"loss": loss}
    def prepare_data(self):
        "Called to initialize data. Use the call to construct features"
@@ -55,8 +67,8 @@ class NERTransformer(BaseTransformer):
                features = torch.load(cached_features_file)
            else:
                logger.info("Creating features from dataset file at %s", args.data_dir)
-                examples = read_examples_from_file(args.data_dir, mode)
+                examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
-                features = convert_examples_to_features(
+                features = self.token_classification_task.convert_examples_to_features(
                    examples,
                    self.labels,
                    args.max_seq_length,
@@ -74,7 +86,7 @@ class NERTransformer(BaseTransformer):
                logger.info("Saving features into cached file %s", cached_features_file)
                torch.save(features, cached_features_file)
-    def load_dataset(self, mode, batch_size):
+    def get_dataloader(self, mode: int, batch_size: int) -> DataLoader:
        "Load datasets. Called after prepare data."
        cached_features_file = self._feature_file(mode)
        logger.info("Loading features from cached file %s", cached_features_file)
@@ -124,6 +136,7 @@ class NERTransformer(BaseTransformer):
        results = {
            "val_loss": val_loss_mean,
            "accuracy_score": accuracy_score(out_label_list, preds_list),
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
@@ -154,6 +167,9 @@ class NERTransformer(BaseTransformer):
    def add_model_specific_args(parser, root_dir):
        # Add NER specific options
        BaseTransformer.add_model_specific_args(parser, root_dir)
        parser.add_argument(
            "--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
        )
        parser.add_argument(
            "--max_seq_length",
            default=128,
--- a/examples/token-classification/run_pos.sh
+++ b/examples/token-classification/run_pos.sh
@@ -0,0 +1,37 @@
 if ! [ -f ./dev.txt ]; then
  echo "Download dev dataset...."
  curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
 fi
 if ! [ -f ./test.txt ]; then
  echo "Download test dataset...."
  curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
 fi
 if ! [ -f ./train.txt ]; then
  echo "Download train dataset...."
  curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
 fi
 export MAX_LENGTH=200
 export BERT_MODEL=bert-base-uncased
 export OUTPUT_DIR=postagger-model
 export BATCH_SIZE=32
 export NUM_EPOCHS=3
 export SAVE_STEPS=750
 export SEED=1
 python3 run_ner.py \
 --task_type POS \
 --data_dir . \
 --model_name_or_path $BERT_MODEL \
 --output_dir $OUTPUT_DIR \
 --max_seq_length  $MAX_LENGTH \
 --num_train_epochs $NUM_EPOCHS \
 --per_gpu_train_batch_size $BATCH_SIZE \
 --save_steps $SAVE_STEPS \
 --seed $SEED \
 --do_train \
 --do_eval \
 --do_predict
--- a/examples/token-classification/run_pos_pl.sh
+++ b/examples/token-classification/run_pos_pl.sh
@@ -0,0 +1,39 @@
 #!/usr/bin/env bash
 if ! [ -f ./dev.txt ]; then
  echo "Download dev dataset...."
  curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
 fi
 if ! [ -f ./test.txt ]; then
  echo "Download test dataset...."
  curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
 fi
 if ! [ -f ./train.txt ]; then
  echo "Download train dataset...."
  curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
 fi
 export MAX_LENGTH=200
 export BERT_MODEL=bert-base-uncased
 export OUTPUT_DIR=postagger-model
 export BATCH_SIZE=32
 export NUM_EPOCHS=3
 export SAVE_STEPS=750
 export SEED=1
 # Add parent directory to python path to access lightning_base.py
 export PYTHONPATH="../":"${PYTHONPATH}"
 python3 run_pl_ner.py --data_dir ./ \
 --task_type POS \
 --model_name_or_path $BERT_MODEL \
 --output_dir $OUTPUT_DIR \
 --max_seq_length  $MAX_LENGTH \
 --num_train_epochs $NUM_EPOCHS \
 --train_batch_size $BATCH_SIZE \
 --seed $SEED \
 --gpus 1 \
 --do_train \
 --do_predict
--- a/examples/token-classification/tasks.py
+++ b/examples/token-classification/tasks.py
@@ -0,0 +1,163 @@
 import logging
 import os
 from typing import List, TextIO, Union
 from conllu import parse_incr
 from utils_ner import InputExample, Split, TokenClassificationTask
 logger = logging.getLogger(__name__)
 class NER(TokenClassificationTask):
    def __init__(self, label_idx=-1):
        # in NER datasets, the last column is usually reserved for NER label
        self.label_idx = label_idx
    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        guid_index = 1
        examples = []
        with open(file_path, encoding="utf-8") as f:
            words = []
            labels = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if words:
                        examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                        guid_index += 1
                        words = []
                        labels = []
                else:
                    splits = line.split(" ")
                    words.append(splits[0])
                    if len(splits) > 1:
                        labels.append(splits[self.label_idx].replace("\n", ""))
                    else:
                        # Examples could have no label for mode = "test"
                        labels.append("O")
            if words:
                examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
        return examples
    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
        example_id = 0
        for line in test_input_reader:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                if not preds_list[example_id]:
                    example_id += 1
            elif preds_list[example_id]:
                output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                writer.write(output_line)
            else:
                logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
    def get_labels(self, path: str) -> List[str]:
        if path:
            with open(path, "r") as f:
                labels = f.read().splitlines()
            if "O" not in labels:
                labels = ["O"] + labels
            return labels
        else:
            return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
 class Chunk(NER):
    def __init__(self):
        # in CONLL2003 dataset chunk column is second-to-last
        super().__init__(label_idx=-2)
    def get_labels(self, path: str) -> List[str]:
        if path:
            with open(path, "r") as f:
                labels = f.read().splitlines()
            if "O" not in labels:
                labels = ["O"] + labels
            return labels
        else:
            return [
                "O",
                "B-ADVP",
                "B-INTJ",
                "B-LST",
                "B-PRT",
                "B-NP",
                "B-SBAR",
                "B-VP",
                "B-ADJP",
                "B-CONJP",
                "B-PP",
                "I-ADVP",
                "I-INTJ",
                "I-LST",
                "I-PRT",
                "I-NP",
                "I-SBAR",
                "I-VP",
                "I-ADJP",
                "I-CONJP",
                "I-PP",
            ]
 class POS(TokenClassificationTask):
    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        guid_index = 1
        examples = []
        with open(file_path, encoding="utf-8") as f:
            for sentence in parse_incr(f):
                words = []
                labels = []
                for token in sentence:
                    words.append(token["form"])
                    labels.append(token["upos"])
                assert len(words) == len(labels)
                if words:
                    examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                    guid_index += 1
        return examples
    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
        example_id = 0
        for sentence in parse_incr(test_input_reader):
            s_p = preds_list[example_id]
            out = ""
            for token in sentence:
                out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
            out += "\n"
            writer.write(out)
            example_id += 1
    def get_labels(self, path: str) -> List[str]:
        if path:
            with open(path, "r") as f:
                return f.read().splitlines()
        else:
            return [
                "ADJ",
                "ADP",
                "ADV",
                "AUX",
                "CCONJ",
                "DET",
                "INTJ",
                "NOUN",
                "NUM",
                "PART",
                "PRON",
                "PROPN",
                "PUNCT",
                "SCONJ",
                "SYM",
                "VERB",
                "X",
            ]
--- a/examples/token-classification/utils_ner.py
+++ b/examples/token-classification/utils_ner.py
@@ -66,202 +66,15 @@ class Split(Enum):
    test = "test"
-if is_torch_available():
+class TokenClassificationTask:
-    import torch
+    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
-    from torch import nn
+        raise NotImplementedError
    from torch.utils.data.dataset import Dataset
    class NerDataset(Dataset):
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """
        features: List[InputFeatures]
        pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
        # Use cross entropy ignore_index as padding label id so that only
        # real label ids contribute to the loss later.
        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
            model_type: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            # Load data features from cache or dataset file
            cached_features_file = os.path.join(
                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
            )
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            with FileLock(lock_path):
                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {data_dir}")
                    examples = read_examples_from_file(data_dir, mode)
                    # TODO clean up all this to leverage built-in features of tokenizers
                    self.features = convert_examples_to_features(
                        examples,
                        labels,
                        max_seq_length,
                        tokenizer,
                        cls_token_at_end=bool(model_type in ["xlnet"]),
                        # xlnet has a cls token at the end
                        cls_token=tokenizer.cls_token,
                        cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                        sep_token=tokenizer.sep_token,
                        sep_token_extra=False,
                        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                        pad_on_left=bool(tokenizer.padding_side == "left"),
                        pad_token=tokenizer.pad_token_id,
                        pad_token_segment_id=tokenizer.pad_token_type_id,
                        pad_token_label_id=self.pad_token_label_id,
                    )
                    logger.info(f"Saving features into cached file {cached_features_file}")
                    torch.save(self.features, cached_features_file)
        def __len__(self):
            return len(self.features)
        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
 if is_tf_available():
    import tensorflow as tf
    class TFNerDataset:
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """
        features: List[InputFeatures]
        pad_token_label_id: int = -100
        # Use cross entropy ignore_index as padding label id so that only
        # real label ids contribute to the loss later.
        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
            model_type: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            examples = read_examples_from_file(data_dir, mode)
            # TODO clean up all this to leverage built-in features of tokenizers
            self.features = convert_examples_to_features(
                examples,
                labels,
                max_seq_length,
                tokenizer,
                cls_token_at_end=bool(model_type in ["xlnet"]),
                # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                sep_token_extra=False,
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=bool(tokenizer.padding_side == "left"),
                pad_token=tokenizer.pad_token_id,
                pad_token_segment_id=tokenizer.pad_token_type_id,
                pad_token_label_id=self.pad_token_label_id,
            )
            def gen():
                for ex in self.features:
                    if ex.token_type_ids is None:
                        yield (
                            {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
                            ex.label_ids,
                        )
                    else:
                        yield (
                            {
                                "input_ids": ex.input_ids,
                                "attention_mask": ex.attention_mask,
                                "token_type_ids": ex.token_type_ids,
                            },
                            ex.label_ids,
                        )
            if "token_type_ids" not in tokenizer.model_input_names:
                self.dataset = tf.data.Dataset.from_generator(
                    gen,
                    ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
                    (
                        {"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
                        tf.TensorShape([None]),
                    ),
                )
            else:
                self.dataset = tf.data.Dataset.from_generator(
                    gen,
                    ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
                    (
                        {
                            "input_ids": tf.TensorShape([None]),
                            "attention_mask": tf.TensorShape([None]),
                            "token_type_ids": tf.TensorShape([None]),
                        },
                        tf.TensorShape([None]),
                    ),
                )
        def get_dataset(self):
            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
            return self.dataset
        def __len__(self):
            return len(self.features)
        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
 def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
    if isinstance(mode, Split):
        mode = mode.value
    file_path = os.path.join(data_dir, f"{mode}.txt")
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
    return examples
    def get_labels(self, path: str) -> List[str]:
        raise NotImplementedError
    def convert_examples_to_features(
        self,
        examples: List[InputExample],
        label_list: List[str],
        max_seq_length: int,
@@ -389,12 +202,169 @@ def convert_examples_to_features(
        return features
-def get_labels(path: str) -> List[str]:
+if is_torch_available():
-    if path:
+    import torch
-        with open(path, "r") as f:
+    from torch import nn
-            labels = f.read().splitlines()
+    from torch.utils.data.dataset import Dataset
-        if "O" not in labels:
+
-            labels = ["O"] + labels
+    class TokenClassificationDataset(Dataset):
-        return labels
+        """
        This will be superseded by a framework-agnostic approach
        soon.
        """
        features: List[InputFeatures]
        pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
        # Use cross entropy ignore_index as padding label id so that only
        # real label ids contribute to the loss later.
        def __init__(
            self,
            token_classification_task: TokenClassificationTask,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
            model_type: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            # Load data features from cache or dataset file
            cached_features_file = os.path.join(
                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
            )
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            with FileLock(lock_path):
                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    self.features = torch.load(cached_features_file)
                else:
-        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+                    logger.info(f"Creating features from dataset file at {data_dir}")
                    examples = token_classification_task.read_examples_from_file(data_dir, mode)
                    # TODO clean up all this to leverage built-in features of tokenizers
                    self.features = token_classification_task.convert_examples_to_features(
                        examples,
                        labels,
                        max_seq_length,
                        tokenizer,
                        cls_token_at_end=bool(model_type in ["xlnet"]),
                        # xlnet has a cls token at the end
                        cls_token=tokenizer.cls_token,
                        cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                        sep_token=tokenizer.sep_token,
                        sep_token_extra=False,
                        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                        pad_on_left=bool(tokenizer.padding_side == "left"),
                        pad_token=tokenizer.pad_token_id,
                        pad_token_segment_id=tokenizer.pad_token_type_id,
                        pad_token_label_id=self.pad_token_label_id,
                    )
                    logger.info(f"Saving features into cached file {cached_features_file}")
                    torch.save(self.features, cached_features_file)
        def __len__(self):
            return len(self.features)
        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
 if is_tf_available():
    import tensorflow as tf
    class TFNerDataset:
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """
        features: List[InputFeatures]
        pad_token_label_id: int = -100
        # Use cross entropy ignore_index as padding label id so that only
        # real label ids contribute to the loss later.
        def __init__(
            self,
            token_classification_task: TokenClassificationTask,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
            model_type: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            examples = token_classification_task.read_examples_from_file(data_dir, mode)
            # TODO clean up all this to leverage built-in features of tokenizers
            self.features = token_classification_task.convert_examples_to_features(
                examples,
                labels,
                max_seq_length,
                tokenizer,
                cls_token_at_end=bool(model_type in ["xlnet"]),
                # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                sep_token_extra=False,
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=bool(tokenizer.padding_side == "left"),
                pad_token=tokenizer.pad_token_id,
                pad_token_segment_id=tokenizer.pad_token_type_id,
                pad_token_label_id=self.pad_token_label_id,
            )
            def gen():
                for ex in self.features:
                    if ex.token_type_ids is None:
                        yield (
                            {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
                            ex.label_ids,
                        )
                    else:
                        yield (
                            {
                                "input_ids": ex.input_ids,
                                "attention_mask": ex.attention_mask,
                                "token_type_ids": ex.token_type_ids,
                            },
                            ex.label_ids,
                        )
            if "token_type_ids" not in tokenizer.model_input_names:
                self.dataset = tf.data.Dataset.from_generator(
                    gen,
                    ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
                    (
                        {"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
                        tf.TensorShape([None]),
                    ),
                )
            else:
                self.dataset = tf.data.Dataset.from_generator(
                    gen,
                    ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
                    (
                        {
                            "input_ids": tf.TensorShape([None]),
                            "attention_mask": tf.TensorShape([None]),
                            "token_type_ids": tf.TensorShape([None]),
                        },
                        tf.TensorShape([None]),
                    ),
                )
        def get_dataset(self):
            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
            return self.dataset
        def __len__(self):
            return len(self.features)
        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
--- a/setup.cfg
+++ b/setup.cfg
@@ -5,6 +5,7 @@ include_trailing_comma = True
 known_first_party = transformers
 known_third_party =
    absl
    conllu
    elasticsearch
    fairseq
    faiss