From eda07efaa58f6c70ac92e7947d8e134cf99d6ec8 Mon Sep 17 00:00:00 2001 From: vblagoje Date: Thu, 13 Aug 2020 12:09:51 -0400 Subject: [PATCH] Add POS tagging and Phrase chunking token classification examples (#6457) * Add more token classification examples * POS tagging example * Phrase chunking example * PR review fixes * Add conllu to third party list (used in token classification examples) --- examples/requirements.txt | 1 + examples/token-classification/run.sh | 1 + examples/token-classification/run_chunk.sh | 37 +++ examples/token-classification/run_ner.py | 46 +-- examples/token-classification/run_pl_ner.py | 36 ++- examples/token-classification/run_pos.sh | 37 +++ examples/token-classification/run_pos_pl.sh | 39 +++ examples/token-classification/tasks.py | 163 ++++++++++ examples/token-classification/utils_ner.py | 316 +++++++++----------- setup.cfg | 1 + 10 files changed, 473 insertions(+), 204 deletions(-) mode change 100644 => 100755 examples/token-classification/run.sh create mode 100755 examples/token-classification/run_chunk.sh create mode 100755 examples/token-classification/run_pos.sh create mode 100755 examples/token-classification/run_pos_pl.sh create mode 100644 examples/token-classification/tasks.py diff --git a/examples/requirements.txt b/examples/requirements.txt index 32b5355297..65d266f63c 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -15,3 +15,4 @@ pandas nlp fire pytest +conllu \ No newline at end of file diff --git a/examples/token-classification/run.sh b/examples/token-classification/run.sh old mode 100644 new mode 100755 index bfb1f1b9f3..9ff10ca36d --- a/examples/token-classification/run.sh +++ b/examples/token-classification/run.sh @@ -18,6 +18,7 @@ export SAVE_STEPS=750 export SEED=1 python3 run_ner.py \ +--task_type NER \ --data_dir . \ --labels ./labels.txt \ --model_name_or_path $BERT_MODEL \ diff --git a/examples/token-classification/run_chunk.sh b/examples/token-classification/run_chunk.sh new file mode 100755 index 0000000000..13341555b6 --- /dev/null +++ b/examples/token-classification/run_chunk.sh @@ -0,0 +1,37 @@ +if ! [ -f ./dev.txt ]; then + echo "Downloading CONLL2003 dev dataset...." + curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt' +fi + +if ! [ -f ./test.txt ]; then + echo "Downloading CONLL2003 test dataset...." + curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt' +fi + +if ! [ -f ./train.txt ]; then + echo "Downloading CONLL2003 train dataset...." + curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt' +fi + +export MAX_LENGTH=200 +export BERT_MODEL=bert-base-uncased +export OUTPUT_DIR=chunker-model +export BATCH_SIZE=32 +export NUM_EPOCHS=3 +export SAVE_STEPS=750 +export SEED=1 + +python3 run_ner.py \ +--task_type Chunk \ +--data_dir . \ +--model_name_or_path $BERT_MODEL \ +--output_dir $OUTPUT_DIR \ +--max_seq_length $MAX_LENGTH \ +--num_train_epochs $NUM_EPOCHS \ +--per_gpu_train_batch_size $BATCH_SIZE \ +--save_steps $SAVE_STEPS \ +--seed $SEED \ +--do_train \ +--do_eval \ +--do_predict + diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index 6cc2a5c9aa..a2981415f6 100644 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -14,16 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Fine-tuning the library models for named entity recognition on CoNLL-2003. """ - - import logging import os import sys from dataclasses import dataclass, field +from importlib import import_module from typing import Dict, List, Optional, Tuple import numpy as np -from seqeval.metrics import f1_score, precision_score, recall_score +from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score from torch import nn from transformers import ( @@ -36,7 +35,7 @@ from transformers import ( TrainingArguments, set_seed, ) -from utils_ner import NerDataset, Split, get_labels +from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask logger = logging.getLogger(__name__) @@ -54,6 +53,9 @@ class ModelArguments: config_name: Optional[str] = field( default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} ) + task_type: Optional[str] = field( + default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"} + ) tokenizer_name: Optional[str] = field( default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) @@ -113,6 +115,16 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) + module = import_module("tasks") + try: + token_classification_task_clazz = getattr(module, model_args.task_type) + token_classification_task: TokenClassificationTask = token_classification_task_clazz() + except AttributeError: + raise ValueError( + f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. " + f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}" + ) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -133,7 +145,7 @@ def main(): set_seed(training_args.seed) # Prepare CONLL-2003 task - labels = get_labels(data_args.labels) + labels = token_classification_task.get_labels(data_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) @@ -164,7 +176,8 @@ def main(): # Get datasets train_dataset = ( - NerDataset( + TokenClassificationDataset( + token_classification_task=token_classification_task, data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, @@ -177,7 +190,8 @@ def main(): else None ) eval_dataset = ( - NerDataset( + TokenClassificationDataset( + token_classification_task=token_classification_task, data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, @@ -209,6 +223,7 @@ def main(): def compute_metrics(p: EvalPrediction) -> Dict: preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) return { + "accuracy_score": accuracy_score(out_label_list, preds_list), "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), @@ -253,7 +268,8 @@ def main(): # Predict if training_args.do_predict: - test_dataset = NerDataset( + test_dataset = TokenClassificationDataset( + token_classification_task=token_classification_task, data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, @@ -278,19 +294,7 @@ def main(): if trainer.is_world_master(): with open(output_test_predictions_file, "w") as writer: with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f: - example_id = 0 - for line in f: - if line.startswith("-DOCSTART-") or line == "" or line == "\n": - writer.write(line) - if not preds_list[example_id]: - example_id += 1 - elif preds_list[example_id]: - output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" - writer.write(output_line) - else: - logger.warning( - "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0] - ) + token_classification_task.write_predictions_to_file(writer, f, preds_list) return results diff --git a/examples/token-classification/run_pl_ner.py b/examples/token-classification/run_pl_ner.py index 022f7f32b9..ed5280c793 100644 --- a/examples/token-classification/run_pl_ner.py +++ b/examples/token-classification/run_pl_ner.py @@ -2,15 +2,17 @@ import argparse import glob import logging import os +from argparse import Namespace +from importlib import import_module import numpy as np import torch -from seqeval.metrics import f1_score, precision_score, recall_score +from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score from torch.nn import CrossEntropyLoss from torch.utils.data import DataLoader, TensorDataset from lightning_base import BaseTransformer, add_generic_args, generic_train -from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file +from utils_ner import TokenClassificationTask logger = logging.getLogger(__name__) @@ -24,10 +26,20 @@ class NERTransformer(BaseTransformer): mode = "token-classification" def __init__(self, hparams): - self.labels = get_labels(hparams.labels) - num_labels = len(self.labels) + if type(hparams) == dict: + hparams = Namespace(**hparams) + module = import_module("tasks") + try: + token_classification_task_clazz = getattr(module, hparams.task_type) + self.token_classification_task: TokenClassificationTask = token_classification_task_clazz() + except AttributeError: + raise ValueError( + f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. " + f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}" + ) + self.labels = self.token_classification_task.get_labels(hparams.labels) self.pad_token_label_id = CrossEntropyLoss().ignore_index - super().__init__(hparams, num_labels, self.mode) + super().__init__(hparams, len(self.labels), self.mode) def forward(self, **inputs): return self.model(**inputs) @@ -42,8 +54,8 @@ class NERTransformer(BaseTransformer): outputs = self(**inputs) loss = outputs[0] - tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]} - return {"loss": loss, "log": tensorboard_logs} + # tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]} + return {"loss": loss} def prepare_data(self): "Called to initialize data. Use the call to construct features" @@ -55,8 +67,8 @@ class NERTransformer(BaseTransformer): features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) - examples = read_examples_from_file(args.data_dir, mode) - features = convert_examples_to_features( + examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode) + features = self.token_classification_task.convert_examples_to_features( examples, self.labels, args.max_seq_length, @@ -74,7 +86,7 @@ class NERTransformer(BaseTransformer): logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) - def load_dataset(self, mode, batch_size): + def get_dataloader(self, mode: int, batch_size: int) -> DataLoader: "Load datasets. Called after prepare data." cached_features_file = self._feature_file(mode) logger.info("Loading features from cached file %s", cached_features_file) @@ -124,6 +136,7 @@ class NERTransformer(BaseTransformer): results = { "val_loss": val_loss_mean, + "accuracy_score": accuracy_score(out_label_list, preds_list), "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), @@ -154,6 +167,9 @@ class NERTransformer(BaseTransformer): def add_model_specific_args(parser, root_dir): # Add NER specific options BaseTransformer.add_model_specific_args(parser, root_dir) + parser.add_argument( + "--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)" + ) parser.add_argument( "--max_seq_length", default=128, diff --git a/examples/token-classification/run_pos.sh b/examples/token-classification/run_pos.sh new file mode 100755 index 0000000000..7d76ed8a2a --- /dev/null +++ b/examples/token-classification/run_pos.sh @@ -0,0 +1,37 @@ +if ! [ -f ./dev.txt ]; then + echo "Download dev dataset...." + curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' +fi + +if ! [ -f ./test.txt ]; then + echo "Download test dataset...." + curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' +fi + +if ! [ -f ./train.txt ]; then + echo "Download train dataset...." + curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' +fi + +export MAX_LENGTH=200 +export BERT_MODEL=bert-base-uncased +export OUTPUT_DIR=postagger-model +export BATCH_SIZE=32 +export NUM_EPOCHS=3 +export SAVE_STEPS=750 +export SEED=1 + +python3 run_ner.py \ +--task_type POS \ +--data_dir . \ +--model_name_or_path $BERT_MODEL \ +--output_dir $OUTPUT_DIR \ +--max_seq_length $MAX_LENGTH \ +--num_train_epochs $NUM_EPOCHS \ +--per_gpu_train_batch_size $BATCH_SIZE \ +--save_steps $SAVE_STEPS \ +--seed $SEED \ +--do_train \ +--do_eval \ +--do_predict + diff --git a/examples/token-classification/run_pos_pl.sh b/examples/token-classification/run_pos_pl.sh new file mode 100755 index 0000000000..e2539ea71f --- /dev/null +++ b/examples/token-classification/run_pos_pl.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +if ! [ -f ./dev.txt ]; then + echo "Download dev dataset...." + curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' +fi + +if ! [ -f ./test.txt ]; then + echo "Download test dataset...." + curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' +fi + +if ! [ -f ./train.txt ]; then + echo "Download train dataset...." + curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' +fi + +export MAX_LENGTH=200 +export BERT_MODEL=bert-base-uncased +export OUTPUT_DIR=postagger-model +export BATCH_SIZE=32 +export NUM_EPOCHS=3 +export SAVE_STEPS=750 +export SEED=1 + + +# Add parent directory to python path to access lightning_base.py +export PYTHONPATH="../":"${PYTHONPATH}" + +python3 run_pl_ner.py --data_dir ./ \ +--task_type POS \ +--model_name_or_path $BERT_MODEL \ +--output_dir $OUTPUT_DIR \ +--max_seq_length $MAX_LENGTH \ +--num_train_epochs $NUM_EPOCHS \ +--train_batch_size $BATCH_SIZE \ +--seed $SEED \ +--gpus 1 \ +--do_train \ +--do_predict diff --git a/examples/token-classification/tasks.py b/examples/token-classification/tasks.py new file mode 100644 index 0000000000..409be0715d --- /dev/null +++ b/examples/token-classification/tasks.py @@ -0,0 +1,163 @@ +import logging +import os +from typing import List, TextIO, Union + +from conllu import parse_incr + +from utils_ner import InputExample, Split, TokenClassificationTask + + +logger = logging.getLogger(__name__) + + +class NER(TokenClassificationTask): + def __init__(self, label_idx=-1): + # in NER datasets, the last column is usually reserved for NER label + self.label_idx = label_idx + + def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]: + if isinstance(mode, Split): + mode = mode.value + file_path = os.path.join(data_dir, f"{mode}.txt") + guid_index = 1 + examples = [] + with open(file_path, encoding="utf-8") as f: + words = [] + labels = [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if words: + examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) + guid_index += 1 + words = [] + labels = [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[self.label_idx].replace("\n", "")) + else: + # Examples could have no label for mode = "test" + labels.append("O") + if words: + examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) + return examples + + def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List): + example_id = 0 + for line in test_input_reader: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + writer.write(line) + if not preds_list[example_id]: + example_id += 1 + elif preds_list[example_id]: + output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" + writer.write(output_line) + else: + logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) + + def get_labels(self, path: str) -> List[str]: + if path: + with open(path, "r") as f: + labels = f.read().splitlines() + if "O" not in labels: + labels = ["O"] + labels + return labels + else: + return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] + + +class Chunk(NER): + def __init__(self): + # in CONLL2003 dataset chunk column is second-to-last + super().__init__(label_idx=-2) + + def get_labels(self, path: str) -> List[str]: + if path: + with open(path, "r") as f: + labels = f.read().splitlines() + if "O" not in labels: + labels = ["O"] + labels + return labels + else: + return [ + "O", + "B-ADVP", + "B-INTJ", + "B-LST", + "B-PRT", + "B-NP", + "B-SBAR", + "B-VP", + "B-ADJP", + "B-CONJP", + "B-PP", + "I-ADVP", + "I-INTJ", + "I-LST", + "I-PRT", + "I-NP", + "I-SBAR", + "I-VP", + "I-ADJP", + "I-CONJP", + "I-PP", + ] + + +class POS(TokenClassificationTask): + def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]: + if isinstance(mode, Split): + mode = mode.value + file_path = os.path.join(data_dir, f"{mode}.txt") + guid_index = 1 + examples = [] + + with open(file_path, encoding="utf-8") as f: + for sentence in parse_incr(f): + words = [] + labels = [] + for token in sentence: + words.append(token["form"]) + labels.append(token["upos"]) + assert len(words) == len(labels) + if words: + examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) + guid_index += 1 + return examples + + def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List): + example_id = 0 + for sentence in parse_incr(test_input_reader): + s_p = preds_list[example_id] + out = "" + for token in sentence: + out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) ' + out += "\n" + writer.write(out) + example_id += 1 + + def get_labels(self, path: str) -> List[str]: + if path: + with open(path, "r") as f: + return f.read().splitlines() + else: + return [ + "ADJ", + "ADP", + "ADV", + "AUX", + "CCONJ", + "DET", + "INTJ", + "NOUN", + "NUM", + "PART", + "PRON", + "PROPN", + "PUNCT", + "SCONJ", + "SYM", + "VERB", + "X", + ] diff --git a/examples/token-classification/utils_ner.py b/examples/token-classification/utils_ner.py index af9680d26c..946abf17d4 100644 --- a/examples/token-classification/utils_ner.py +++ b/examples/token-classification/utils_ner.py @@ -66,12 +66,148 @@ class Split(Enum): test = "test" +class TokenClassificationTask: + def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]: + raise NotImplementedError + + def get_labels(self, path: str) -> List[str]: + raise NotImplementedError + + def convert_examples_to_features( + self, + examples: List[InputExample], + label_list: List[str], + max_seq_length: int, + tokenizer: PreTrainedTokenizer, + cls_token_at_end=False, + cls_token="[CLS]", + cls_token_segment_id=1, + sep_token="[SEP]", + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-100, + sequence_a_segment_id=0, + mask_padding_with_zero=True, + ) -> List[InputFeatures]: + """ Loads a data file into a list of `InputFeatures` + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) + """ + # TODO clean up all this to leverage built-in features of tokenizers + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10_000 == 0: + logger.info("Writing example %d of %d", ex_index, len(examples)) + + tokens = [] + label_ids = [] + for word, label in zip(example.words, example.labels): + word_tokens = tokenizer.tokenize(word) + + # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space. + if len(word_tokens) > 0: + tokens.extend(word_tokens) + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) + + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. + special_tokens_count = tokenizer.num_special_tokens_to_add() + if len(tokens) > max_seq_length - special_tokens_count: + tokens = tokens[: (max_seq_length - special_tokens_count)] + label_ids = label_ids[: (max_seq_length - special_tokens_count)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens += [sep_token] + label_ids += [pad_token_label_id] + if sep_token_extra: + # roberta uses an extra separator b/w pairs of sentences + tokens += [sep_token] + label_ids += [pad_token_label_id] + segment_ids = [sequence_a_segment_id] * len(tokens) + + if cls_token_at_end: + tokens += [cls_token] + label_ids += [pad_token_label_id] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + label_ids = [pad_token_label_id] + label_ids + segment_ids = [cls_token_segment_id] + segment_ids + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + padding_length = max_seq_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + label_ids = ([pad_token_label_id] * padding_length) + label_ids + else: + input_ids += [pad_token] * padding_length + input_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + label_ids += [pad_token_label_id] * padding_length + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s", example.guid) + logger.info("tokens: %s", " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) + + if "token_type_ids" not in tokenizer.model_input_names: + segment_ids = None + + features.append( + InputFeatures( + input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids + ) + ) + return features + + if is_torch_available(): import torch from torch import nn from torch.utils.data.dataset import Dataset - class NerDataset(Dataset): + class TokenClassificationDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. @@ -84,6 +220,7 @@ if is_torch_available(): def __init__( self, + token_classification_task: TokenClassificationTask, data_dir: str, tokenizer: PreTrainedTokenizer, labels: List[str], @@ -107,9 +244,9 @@ if is_torch_available(): self.features = torch.load(cached_features_file) else: logger.info(f"Creating features from dataset file at {data_dir}") - examples = read_examples_from_file(data_dir, mode) + examples = token_classification_task.read_examples_from_file(data_dir, mode) # TODO clean up all this to leverage built-in features of tokenizers - self.features = convert_examples_to_features( + self.features = token_classification_task.convert_examples_to_features( examples, labels, max_seq_length, @@ -152,6 +289,7 @@ if is_tf_available(): def __init__( self, + token_classification_task: TokenClassificationTask, data_dir: str, tokenizer: PreTrainedTokenizer, labels: List[str], @@ -160,9 +298,9 @@ if is_tf_available(): overwrite_cache=False, mode: Split = Split.train, ): - examples = read_examples_from_file(data_dir, mode) + examples = token_classification_task.read_examples_from_file(data_dir, mode) # TODO clean up all this to leverage built-in features of tokenizers - self.features = convert_examples_to_features( + self.features = token_classification_task.convert_examples_to_features( examples, labels, max_seq_length, @@ -230,171 +368,3 @@ if is_tf_available(): def __getitem__(self, i) -> InputFeatures: return self.features[i] - - -def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]: - if isinstance(mode, Split): - mode = mode.value - file_path = os.path.join(data_dir, f"{mode}.txt") - guid_index = 1 - examples = [] - with open(file_path, encoding="utf-8") as f: - words = [] - labels = [] - for line in f: - if line.startswith("-DOCSTART-") or line == "" or line == "\n": - if words: - examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) - guid_index += 1 - words = [] - labels = [] - else: - splits = line.split(" ") - words.append(splits[0]) - if len(splits) > 1: - labels.append(splits[-1].replace("\n", "")) - else: - # Examples could have no label for mode = "test" - labels.append("O") - if words: - examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) - return examples - - -def convert_examples_to_features( - examples: List[InputExample], - label_list: List[str], - max_seq_length: int, - tokenizer: PreTrainedTokenizer, - cls_token_at_end=False, - cls_token="[CLS]", - cls_token_segment_id=1, - sep_token="[SEP]", - sep_token_extra=False, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - pad_token_label_id=-100, - sequence_a_segment_id=0, - mask_padding_with_zero=True, -) -> List[InputFeatures]: - """ Loads a data file into a list of `InputFeatures` - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - """ - # TODO clean up all this to leverage built-in features of tokenizers - - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10_000 == 0: - logger.info("Writing example %d of %d", ex_index, len(examples)) - - tokens = [] - label_ids = [] - for word, label in zip(example.words, example.labels): - word_tokens = tokenizer.tokenize(word) - - # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space. - if len(word_tokens) > 0: - tokens.extend(word_tokens) - # Use the real label id for the first token of the word, and padding ids for the remaining tokens - label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) - - # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = tokenizer.num_special_tokens_to_add() - if len(tokens) > max_seq_length - special_tokens_count: - tokens = tokens[: (max_seq_length - special_tokens_count)] - label_ids = label_ids[: (max_seq_length - special_tokens_count)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens += [sep_token] - label_ids += [pad_token_label_id] - if sep_token_extra: - # roberta uses an extra separator b/w pairs of sentences - tokens += [sep_token] - label_ids += [pad_token_label_id] - segment_ids = [sequence_a_segment_id] * len(tokens) - - if cls_token_at_end: - tokens += [cls_token] - label_ids += [pad_token_label_id] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - label_ids = [pad_token_label_id] + label_ids - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_seq_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - label_ids = ([pad_token_label_id] * padding_length) + label_ids - else: - input_ids += [pad_token] * padding_length - input_mask += [0 if mask_padding_with_zero else 1] * padding_length - segment_ids += [pad_token_segment_id] * padding_length - label_ids += [pad_token_label_id] * padding_length - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - assert len(label_ids) == max_seq_length - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s", example.guid) - logger.info("tokens: %s", " ".join([str(x) for x in tokens])) - logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) - logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) - logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) - - if "token_type_ids" not in tokenizer.model_input_names: - segment_ids = None - - features.append( - InputFeatures( - input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids - ) - ) - return features - - -def get_labels(path: str) -> List[str]: - if path: - with open(path, "r") as f: - labels = f.read().splitlines() - if "O" not in labels: - labels = ["O"] + labels - return labels - else: - return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] diff --git a/setup.cfg b/setup.cfg index e5467ab623..c17da97a77 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,7 @@ include_trailing_comma = True known_first_party = transformers known_third_party = absl + conllu elasticsearch fairseq faiss