Add POS tagging and Phrase chunking token classification examples (#6457)
* Add more token classification examples * POS tagging example * Phrase chunking example * PR review fixes * Add conllu to third party list (used in token classification examples)
This commit is contained in:
@@ -15,3 +15,4 @@ pandas
|
|||||||
nlp
|
nlp
|
||||||
fire
|
fire
|
||||||
pytest
|
pytest
|
||||||
|
conllu
|
||||||
1
examples/token-classification/run.sh
Normal file → Executable file
1
examples/token-classification/run.sh
Normal file → Executable file
@@ -18,6 +18,7 @@ export SAVE_STEPS=750
|
|||||||
export SEED=1
|
export SEED=1
|
||||||
|
|
||||||
python3 run_ner.py \
|
python3 run_ner.py \
|
||||||
|
--task_type NER \
|
||||||
--data_dir . \
|
--data_dir . \
|
||||||
--labels ./labels.txt \
|
--labels ./labels.txt \
|
||||||
--model_name_or_path $BERT_MODEL \
|
--model_name_or_path $BERT_MODEL \
|
||||||
|
|||||||
37
examples/token-classification/run_chunk.sh
Executable file
37
examples/token-classification/run_chunk.sh
Executable file
@@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env bash
# Fine-tune a BERT model for phrase chunking by running run_ner.py with
# --task_type Chunk. The CoNLL-2003 splits are downloaded into the current
# directory the first time the script runs.

# curl -f: exit with an error on an HTTP failure instead of saving the server's
# error page as the dataset (the existence checks would then keep that bad file).
if ! [ -f ./dev.txt ]; then
  echo "Downloading CONLL2003 dev dataset...."
  curl -f -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
fi

if ! [ -f ./test.txt ]; then
  echo "Downloading CONLL2003 test dataset...."
  curl -f -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
fi

if ! [ -f ./train.txt ]; then
  echo "Downloading CONLL2003 train dataset...."
  curl -f -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
fi

# Training configuration, passed to run_ner.py below.
export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=chunker-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1

# Expansions are quoted so that an overridden value containing spaces
# (for example a local model path) is passed as a single argument.
python3 run_ner.py \
  --task_type Chunk \
  --data_dir . \
  --model_name_or_path "$BERT_MODEL" \
  --output_dir "$OUTPUT_DIR" \
  --max_seq_length "$MAX_LENGTH" \
  --num_train_epochs "$NUM_EPOCHS" \
  --per_gpu_train_batch_size "$BATCH_SIZE" \
  --save_steps "$SAVE_STEPS" \
  --seed "$SEED" \
  --do_train \
  --do_eval \
  --do_predict
|
||||||
|
|
||||||
@@ -14,16 +14,15 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """
|
""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """
|
||||||
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from importlib import import_module
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from seqeval.metrics import f1_score, precision_score, recall_score
|
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
@@ -36,7 +35,7 @@ from transformers import (
|
|||||||
TrainingArguments,
|
TrainingArguments,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from utils_ner import NerDataset, Split, get_labels
|
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -54,6 +53,9 @@ class ModelArguments:
|
|||||||
config_name: Optional[str] = field(
|
config_name: Optional[str] = field(
|
||||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||||
)
|
)
|
||||||
|
task_type: Optional[str] = field(
|
||||||
|
default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
|
||||||
|
)
|
||||||
tokenizer_name: Optional[str] = field(
|
tokenizer_name: Optional[str] = field(
|
||||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||||
)
|
)
|
||||||
@@ -113,6 +115,16 @@ def main():
|
|||||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
module = import_module("tasks")
|
||||||
|
try:
|
||||||
|
token_classification_task_clazz = getattr(module, model_args.task_type)
|
||||||
|
token_classification_task: TokenClassificationTask = token_classification_task_clazz()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError(
|
||||||
|
f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
|
||||||
|
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
|
||||||
|
)
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -133,7 +145,7 @@ def main():
|
|||||||
set_seed(training_args.seed)
|
set_seed(training_args.seed)
|
||||||
|
|
||||||
# Prepare CONLL-2003 task
|
# Prepare CONLL-2003 task
|
||||||
labels = get_labels(data_args.labels)
|
labels = token_classification_task.get_labels(data_args.labels)
|
||||||
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
|
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
|
||||||
num_labels = len(labels)
|
num_labels = len(labels)
|
||||||
|
|
||||||
@@ -164,7 +176,8 @@ def main():
|
|||||||
|
|
||||||
# Get datasets
|
# Get datasets
|
||||||
train_dataset = (
|
train_dataset = (
|
||||||
NerDataset(
|
TokenClassificationDataset(
|
||||||
|
token_classification_task=token_classification_task,
|
||||||
data_dir=data_args.data_dir,
|
data_dir=data_args.data_dir,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
labels=labels,
|
labels=labels,
|
||||||
@@ -177,7 +190,8 @@ def main():
|
|||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
eval_dataset = (
|
eval_dataset = (
|
||||||
NerDataset(
|
TokenClassificationDataset(
|
||||||
|
token_classification_task=token_classification_task,
|
||||||
data_dir=data_args.data_dir,
|
data_dir=data_args.data_dir,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
labels=labels,
|
labels=labels,
|
||||||
@@ -209,6 +223,7 @@ def main():
|
|||||||
def compute_metrics(p: EvalPrediction) -> Dict:
|
def compute_metrics(p: EvalPrediction) -> Dict:
|
||||||
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
|
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
|
||||||
return {
|
return {
|
||||||
|
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||||
"precision": precision_score(out_label_list, preds_list),
|
"precision": precision_score(out_label_list, preds_list),
|
||||||
"recall": recall_score(out_label_list, preds_list),
|
"recall": recall_score(out_label_list, preds_list),
|
||||||
"f1": f1_score(out_label_list, preds_list),
|
"f1": f1_score(out_label_list, preds_list),
|
||||||
@@ -253,7 +268,8 @@ def main():
|
|||||||
|
|
||||||
# Predict
|
# Predict
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
test_dataset = NerDataset(
|
test_dataset = TokenClassificationDataset(
|
||||||
|
token_classification_task=token_classification_task,
|
||||||
data_dir=data_args.data_dir,
|
data_dir=data_args.data_dir,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
labels=labels,
|
labels=labels,
|
||||||
@@ -278,19 +294,7 @@ def main():
|
|||||||
if trainer.is_world_master():
|
if trainer.is_world_master():
|
||||||
with open(output_test_predictions_file, "w") as writer:
|
with open(output_test_predictions_file, "w") as writer:
|
||||||
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
|
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
|
||||||
example_id = 0
|
token_classification_task.write_predictions_to_file(writer, f, preds_list)
|
||||||
for line in f:
|
|
||||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
|
||||||
writer.write(line)
|
|
||||||
if not preds_list[example_id]:
|
|
||||||
example_id += 1
|
|
||||||
elif preds_list[example_id]:
|
|
||||||
output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
|
|
||||||
writer.write(output_line)
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
"Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
|
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@@ -2,15 +2,17 @@ import argparse
|
|||||||
import glob
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from argparse import Namespace
|
||||||
|
from importlib import import_module
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from seqeval.metrics import f1_score, precision_score, recall_score
|
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.utils.data import DataLoader, TensorDataset
|
from torch.utils.data import DataLoader, TensorDataset
|
||||||
|
|
||||||
from lightning_base import BaseTransformer, add_generic_args, generic_train
|
from lightning_base import BaseTransformer, add_generic_args, generic_train
|
||||||
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
from utils_ner import TokenClassificationTask
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -24,10 +26,20 @@ class NERTransformer(BaseTransformer):
|
|||||||
mode = "token-classification"
|
mode = "token-classification"
|
||||||
|
|
||||||
def __init__(self, hparams):
|
def __init__(self, hparams):
|
||||||
self.labels = get_labels(hparams.labels)
|
if type(hparams) == dict:
|
||||||
num_labels = len(self.labels)
|
hparams = Namespace(**hparams)
|
||||||
|
module = import_module("tasks")
|
||||||
|
try:
|
||||||
|
token_classification_task_clazz = getattr(module, hparams.task_type)
|
||||||
|
self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError(
|
||||||
|
f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
|
||||||
|
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
|
||||||
|
)
|
||||||
|
self.labels = self.token_classification_task.get_labels(hparams.labels)
|
||||||
self.pad_token_label_id = CrossEntropyLoss().ignore_index
|
self.pad_token_label_id = CrossEntropyLoss().ignore_index
|
||||||
super().__init__(hparams, num_labels, self.mode)
|
super().__init__(hparams, len(self.labels), self.mode)
|
||||||
|
|
||||||
def forward(self, **inputs):
|
def forward(self, **inputs):
|
||||||
return self.model(**inputs)
|
return self.model(**inputs)
|
||||||
@@ -42,8 +54,8 @@ class NERTransformer(BaseTransformer):
|
|||||||
|
|
||||||
outputs = self(**inputs)
|
outputs = self(**inputs)
|
||||||
loss = outputs[0]
|
loss = outputs[0]
|
||||||
tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
|
# tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
|
||||||
return {"loss": loss, "log": tensorboard_logs}
|
return {"loss": loss}
|
||||||
|
|
||||||
def prepare_data(self):
|
def prepare_data(self):
|
||||||
"Called to initialize data. Use the call to construct features"
|
"Called to initialize data. Use the call to construct features"
|
||||||
@@ -55,8 +67,8 @@ class NERTransformer(BaseTransformer):
|
|||||||
features = torch.load(cached_features_file)
|
features = torch.load(cached_features_file)
|
||||||
else:
|
else:
|
||||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||||
examples = read_examples_from_file(args.data_dir, mode)
|
examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
|
||||||
features = convert_examples_to_features(
|
features = self.token_classification_task.convert_examples_to_features(
|
||||||
examples,
|
examples,
|
||||||
self.labels,
|
self.labels,
|
||||||
args.max_seq_length,
|
args.max_seq_length,
|
||||||
@@ -74,7 +86,7 @@ class NERTransformer(BaseTransformer):
|
|||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
torch.save(features, cached_features_file)
|
torch.save(features, cached_features_file)
|
||||||
|
|
||||||
def load_dataset(self, mode, batch_size):
|
def get_dataloader(self, mode: int, batch_size: int) -> DataLoader:
|
||||||
"Load datasets. Called after prepare data."
|
"Load datasets. Called after prepare data."
|
||||||
cached_features_file = self._feature_file(mode)
|
cached_features_file = self._feature_file(mode)
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
@@ -124,6 +136,7 @@ class NERTransformer(BaseTransformer):
|
|||||||
|
|
||||||
results = {
|
results = {
|
||||||
"val_loss": val_loss_mean,
|
"val_loss": val_loss_mean,
|
||||||
|
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||||
"precision": precision_score(out_label_list, preds_list),
|
"precision": precision_score(out_label_list, preds_list),
|
||||||
"recall": recall_score(out_label_list, preds_list),
|
"recall": recall_score(out_label_list, preds_list),
|
||||||
"f1": f1_score(out_label_list, preds_list),
|
"f1": f1_score(out_label_list, preds_list),
|
||||||
@@ -154,6 +167,9 @@ class NERTransformer(BaseTransformer):
|
|||||||
def add_model_specific_args(parser, root_dir):
|
def add_model_specific_args(parser, root_dir):
|
||||||
# Add NER specific options
|
# Add NER specific options
|
||||||
BaseTransformer.add_model_specific_args(parser, root_dir)
|
BaseTransformer.add_model_specific_args(parser, root_dir)
|
||||||
|
parser.add_argument(
|
||||||
|
"--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max_seq_length",
|
"--max_seq_length",
|
||||||
default=128,
|
default=128,
|
||||||
|
|||||||
37
examples/token-classification/run_pos.sh
Executable file
37
examples/token-classification/run_pos.sh
Executable file
@@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env bash
# Fine-tune a BERT model for part-of-speech tagging by running run_ner.py with
# --task_type POS. The UD_English-EWT splits (CoNLL-U format) are downloaded into
# the current directory the first time the script runs.

# curl -f: exit with an error on an HTTP failure instead of saving the server's
# error page as the dataset (the existence checks would then keep that bad file).
if ! [ -f ./dev.txt ]; then
  echo "Download dev dataset...."
  curl -f -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
fi

if ! [ -f ./test.txt ]; then
  echo "Download test dataset...."
  curl -f -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
fi

if ! [ -f ./train.txt ]; then
  echo "Download train dataset...."
  curl -f -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
fi

# Training configuration, passed to run_ner.py below.
export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=postagger-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1

# Expansions are quoted so that an overridden value containing spaces
# (for example a local model path) is passed as a single argument.
python3 run_ner.py \
  --task_type POS \
  --data_dir . \
  --model_name_or_path "$BERT_MODEL" \
  --output_dir "$OUTPUT_DIR" \
  --max_seq_length "$MAX_LENGTH" \
  --num_train_epochs "$NUM_EPOCHS" \
  --per_gpu_train_batch_size "$BATCH_SIZE" \
  --save_steps "$SAVE_STEPS" \
  --seed "$SEED" \
  --do_train \
  --do_eval \
  --do_predict
|
||||||
|
|
||||||
39
examples/token-classification/run_pos_pl.sh
Executable file
39
examples/token-classification/run_pos_pl.sh
Executable file
@@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/env bash
# Fine-tune a BERT model for part-of-speech tagging by running run_pl_ner.py with
# --task_type POS. The UD_English-EWT splits (CoNLL-U format) are downloaded into
# the current directory the first time the script runs.

# curl -f: exit with an error on an HTTP failure instead of saving the server's
# error page as the dataset (the existence checks would then keep that bad file).
if ! [ -f ./dev.txt ]; then
  echo "Download dev dataset...."
  curl -f -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
fi

if ! [ -f ./test.txt ]; then
  echo "Download test dataset...."
  curl -f -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
fi

if ! [ -f ./train.txt ]; then
  echo "Download train dataset...."
  curl -f -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
fi

# Training configuration, passed to run_pl_ner.py below.
export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=postagger-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1


# Add parent directory to python path to access lightning_base.py
# The ${VAR:+...} form appends the previous value only when it is set and
# non-empty, so no dangling ':' (an empty path entry) is left behind.
export PYTHONPATH="../${PYTHONPATH:+:${PYTHONPATH}}"

# Expansions are quoted so that an overridden value containing spaces
# (for example a local model path) is passed as a single argument.
python3 run_pl_ner.py --data_dir ./ \
  --task_type POS \
  --model_name_or_path "$BERT_MODEL" \
  --output_dir "$OUTPUT_DIR" \
  --max_seq_length "$MAX_LENGTH" \
  --num_train_epochs "$NUM_EPOCHS" \
  --train_batch_size "$BATCH_SIZE" \
  --seed "$SEED" \
  --gpus 1 \
  --do_train \
  --do_predict
|
||||||
163
examples/token-classification/tasks.py
Normal file
163
examples/token-classification/tasks.py
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import List, TextIO, Union
|
||||||
|
|
||||||
|
from conllu import parse_incr
|
||||||
|
|
||||||
|
from utils_ner import InputExample, Split, TokenClassificationTask
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class NER(TokenClassificationTask):
    """Token classification task for column-formatted (CoNLL-style) NER files.

    Each non-separator line holds one token as space-separated columns; the
    first column is the word and ``label_idx`` selects the label column.
    Blank lines and lines starting with ``-DOCSTART-`` separate examples.
    """

    def __init__(self, label_idx=-1):
        # in NER datasets, the last column is usually reserved for NER label
        self.label_idx = label_idx

    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
        """Read ``<data_dir>/<mode>.txt`` and return one InputExample per sentence.

        Args:
            data_dir: Directory containing the split files.
            mode: A ``Split`` member or the split name as a string.

        Returns:
            Examples with guids ``"<mode>-<n>"`` numbered from 1. Tokens that
            have no label column get the label ``"O"``.
        """
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        guid_index = 1
        examples = []
        with open(file_path, encoding="utf-8") as f:
            words = []
            labels = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if words:
                        examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                        guid_index += 1
                        words = []
                        labels = []
                else:
                    # Drop the line terminator before splitting; otherwise a
                    # label-less token ("word\n") keeps the newline in the word.
                    splits = line.rstrip("\n").split(" ")
                    words.append(splits[0])
                    if len(splits) > 1:
                        labels.append(splits[self.label_idx])
                    else:
                        # Examples could have no label for mode = "test"
                        labels.append("O")
            # Flush the last sentence when the file does not end with a separator.
            if words:
                examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
        return examples

    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
        """Write ``<word> <predicted label>`` lines aligned with the test input.

        Separator lines are copied through unchanged. Tokens without a
        prediction (for example because the example was cut off at the maximum
        sequence length) are skipped with a warning. ``preds_list`` is only
        read, never modified.

        Args:
            writer: Open text stream receiving the predictions.
            test_input_reader: Open text stream over the test file.
            preds_list: One list of predicted labels per example, in file order.
        """
        example_id = 0
        token_idx = 0  # position inside the current example's predictions
        for line in test_input_reader:
            # The bounds check lets trailing separator or token lines after the
            # last example pass through instead of raising IndexError.
            current = preds_list[example_id] if example_id < len(preds_list) else None
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                # Move on only once the current example is fully consumed.
                if current is not None and token_idx >= len(current):
                    example_id += 1
                    token_idx = 0
            elif current is not None and token_idx < len(current):
                output_line = line.split()[0] + " " + current[token_idx] + "\n"
                writer.write(output_line)
                token_idx += 1
            else:
                logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])

    def get_labels(self, path: str) -> List[str]:
        """Return the label set, read from ``path`` or the CoNLL-2003 default.

        Args:
            path: File with one label per line; a falsy value selects the
                built-in CoNLL-2003 NER labels.

        Returns:
            The labels in file order, with ``"O"`` prepended when absent.
        """
        if path:
            with open(path, "r") as f:
                labels = f.read().splitlines()
            if "O" not in labels:
                labels = ["O"] + labels
            return labels
        else:
            return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
|
||||||
|
|
||||||
|
|
||||||
|
class Chunk(NER):
    """Phrase-chunking task: the NER reader pointed at the chunk column."""

    def __init__(self):
        # in CONLL2003 dataset chunk column is second-to-last
        super().__init__(label_idx=-2)

    def get_labels(self, path: str) -> List[str]:
        """Return the chunk label set, read from ``path`` or the default.

        Args:
            path: File with one label per line; a falsy value selects the
                built-in chunk labels.

        Returns:
            The labels in file order, with ``"O"`` prepended when absent.
        """
        if not path:
            # Built-in default: "O" followed by the B- tags, then the I- tags.
            return [
                "O",
                "B-ADVP",
                "B-INTJ",
                "B-LST",
                "B-PRT",
                "B-NP",
                "B-SBAR",
                "B-VP",
                "B-ADJP",
                "B-CONJP",
                "B-PP",
                "I-ADVP",
                "I-INTJ",
                "I-LST",
                "I-PRT",
                "I-NP",
                "I-SBAR",
                "I-VP",
                "I-ADJP",
                "I-CONJP",
                "I-PP",
            ]

        with open(path, "r") as label_file:
            chunk_labels = label_file.read().splitlines()
        if "O" in chunk_labels:
            return chunk_labels
        return ["O", *chunk_labels]
|
||||||
|
|
||||||
|
|
||||||
|
class POS(TokenClassificationTask):
    """Token classification task for part-of-speech tagging on CoNLL-U files.

    Files are parsed with ``conllu.parse_incr``; each token's ``form`` is the
    word and its ``upos`` field is the label.
    """

    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
        """Read ``<data_dir>/<mode>.txt`` and return one InputExample per sentence.

        Args:
            data_dir: Directory containing the split files.
            mode: A ``Split`` member or the split name as a string.

        Returns:
            Examples with guids ``"<mode>-<n>"`` numbered from 1. Sentences
            without tokens produce no example.
        """
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        guid_index = 1
        examples = []

        with open(file_path, encoding="utf-8") as f:
            for sentence in parse_incr(f):
                tokens = list(sentence)
                if not tokens:
                    continue
                words = [token["form"] for token in tokens]
                labels = [token["upos"] for token in tokens]
                examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                guid_index += 1
        return examples

    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
        """Write one line per sentence as ``form (gold_upos|predicted) `` items.

        Tokens without a prediction (for example because the example was cut
        off at the maximum sequence length) are skipped with a warning instead
        of raising IndexError. ``preds_list`` is only read, never modified.

        Args:
            writer: Open text stream receiving the predictions.
            test_input_reader: Open text stream over the CoNLL-U test file.
            preds_list: One list of predicted labels per example, in file order.
        """
        example_id = 0
        for sentence in parse_incr(test_input_reader):
            tokens = list(sentence)
            if not tokens:
                # read_examples_from_file creates no example for an empty
                # sentence, so it must not consume a prediction slot here.
                writer.write("\n")
                continue
            s_p = preds_list[example_id] if example_id < len(preds_list) else []
            parts = []
            for position, token in enumerate(tokens):
                if position < len(s_p):
                    parts.append(f'{token["form"]} ({token["upos"]}|{s_p[position]}) ')
                else:
                    logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", token["form"])
            writer.write("".join(parts) + "\n")
            example_id += 1

    def get_labels(self, path: str) -> List[str]:
        """Return the tag set, read from ``path`` or the built-in default.

        Args:
            path: File with one label per line; a falsy value selects the
                built-in list of tags.

        Returns:
            The labels in file order (no ``"O"`` label is added for this task).
        """
        if path:
            with open(path, "r") as f:
                return f.read().splitlines()
        else:
            return [
                "ADJ",
                "ADP",
                "ADV",
                "AUX",
                "CCONJ",
                "DET",
                "INTJ",
                "NOUN",
                "NUM",
                "PART",
                "PRON",
                "PROPN",
                "PUNCT",
                "SCONJ",
                "SYM",
                "VERB",
                "X",
            ]
|
||||||
@@ -66,202 +66,15 @@ class Split(Enum):
|
|||||||
test = "test"
|
test = "test"
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
class TokenClassificationTask:
|
||||||
import torch
|
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||||
from torch import nn
|
raise NotImplementedError
|
||||||
from torch.utils.data.dataset import Dataset
|
|
||||||
|
|
||||||
class NerDataset(Dataset):
|
|
||||||
"""
|
|
||||||
This will be superseded by a framework-agnostic approach
|
|
||||||
soon.
|
|
||||||
"""
|
|
||||||
|
|
||||||
features: List[InputFeatures]
|
|
||||||
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
|
|
||||||
# Use cross entropy ignore_index as padding label id so that only
|
|
||||||
# real label ids contribute to the loss later.
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
data_dir: str,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
labels: List[str],
|
|
||||||
model_type: str,
|
|
||||||
max_seq_length: Optional[int] = None,
|
|
||||||
overwrite_cache=False,
|
|
||||||
mode: Split = Split.train,
|
|
||||||
):
|
|
||||||
# Load data features from cache or dataset file
|
|
||||||
cached_features_file = os.path.join(
|
|
||||||
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make sure only the first process in distributed training processes the dataset,
|
|
||||||
# and the others will use the cache.
|
|
||||||
lock_path = cached_features_file + ".lock"
|
|
||||||
with FileLock(lock_path):
|
|
||||||
|
|
||||||
if os.path.exists(cached_features_file) and not overwrite_cache:
|
|
||||||
logger.info(f"Loading features from cached file {cached_features_file}")
|
|
||||||
self.features = torch.load(cached_features_file)
|
|
||||||
else:
|
|
||||||
logger.info(f"Creating features from dataset file at {data_dir}")
|
|
||||||
examples = read_examples_from_file(data_dir, mode)
|
|
||||||
# TODO clean up all this to leverage built-in features of tokenizers
|
|
||||||
self.features = convert_examples_to_features(
|
|
||||||
examples,
|
|
||||||
labels,
|
|
||||||
max_seq_length,
|
|
||||||
tokenizer,
|
|
||||||
cls_token_at_end=bool(model_type in ["xlnet"]),
|
|
||||||
# xlnet has a cls token at the end
|
|
||||||
cls_token=tokenizer.cls_token,
|
|
||||||
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
|
||||||
sep_token=tokenizer.sep_token,
|
|
||||||
sep_token_extra=False,
|
|
||||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
|
||||||
pad_on_left=bool(tokenizer.padding_side == "left"),
|
|
||||||
pad_token=tokenizer.pad_token_id,
|
|
||||||
pad_token_segment_id=tokenizer.pad_token_type_id,
|
|
||||||
pad_token_label_id=self.pad_token_label_id,
|
|
||||||
)
|
|
||||||
logger.info(f"Saving features into cached file {cached_features_file}")
|
|
||||||
torch.save(self.features, cached_features_file)
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self.features)
|
|
||||||
|
|
||||||
def __getitem__(self, i) -> InputFeatures:
|
|
||||||
return self.features[i]
|
|
||||||
|
|
||||||
|
|
||||||
if is_tf_available():
|
|
||||||
import tensorflow as tf
|
|
||||||
|
|
||||||
class TFNerDataset:
|
|
||||||
"""
|
|
||||||
This will be superseded by a framework-agnostic approach
|
|
||||||
soon.
|
|
||||||
"""
|
|
||||||
|
|
||||||
features: List[InputFeatures]
|
|
||||||
pad_token_label_id: int = -100
|
|
||||||
# Use cross entropy ignore_index as padding label id so that only
|
|
||||||
# real label ids contribute to the loss later.
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
data_dir: str,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
labels: List[str],
|
|
||||||
model_type: str,
|
|
||||||
max_seq_length: Optional[int] = None,
|
|
||||||
overwrite_cache=False,
|
|
||||||
mode: Split = Split.train,
|
|
||||||
):
|
|
||||||
examples = read_examples_from_file(data_dir, mode)
|
|
||||||
# TODO clean up all this to leverage built-in features of tokenizers
|
|
||||||
self.features = convert_examples_to_features(
|
|
||||||
examples,
|
|
||||||
labels,
|
|
||||||
max_seq_length,
|
|
||||||
tokenizer,
|
|
||||||
cls_token_at_end=bool(model_type in ["xlnet"]),
|
|
||||||
# xlnet has a cls token at the end
|
|
||||||
cls_token=tokenizer.cls_token,
|
|
||||||
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
|
||||||
sep_token=tokenizer.sep_token,
|
|
||||||
sep_token_extra=False,
|
|
||||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
|
||||||
pad_on_left=bool(tokenizer.padding_side == "left"),
|
|
||||||
pad_token=tokenizer.pad_token_id,
|
|
||||||
pad_token_segment_id=tokenizer.pad_token_type_id,
|
|
||||||
pad_token_label_id=self.pad_token_label_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
def gen():
|
|
||||||
for ex in self.features:
|
|
||||||
if ex.token_type_ids is None:
|
|
||||||
yield (
|
|
||||||
{"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
|
|
||||||
ex.label_ids,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
yield (
|
|
||||||
{
|
|
||||||
"input_ids": ex.input_ids,
|
|
||||||
"attention_mask": ex.attention_mask,
|
|
||||||
"token_type_ids": ex.token_type_ids,
|
|
||||||
},
|
|
||||||
ex.label_ids,
|
|
||||||
)
|
|
||||||
|
|
||||||
if "token_type_ids" not in tokenizer.model_input_names:
|
|
||||||
self.dataset = tf.data.Dataset.from_generator(
|
|
||||||
gen,
|
|
||||||
({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
|
|
||||||
(
|
|
||||||
{"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
|
|
||||||
tf.TensorShape([None]),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.dataset = tf.data.Dataset.from_generator(
|
|
||||||
gen,
|
|
||||||
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
|
|
||||||
(
|
|
||||||
{
|
|
||||||
"input_ids": tf.TensorShape([None]),
|
|
||||||
"attention_mask": tf.TensorShape([None]),
|
|
||||||
"token_type_ids": tf.TensorShape([None]),
|
|
||||||
},
|
|
||||||
tf.TensorShape([None]),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_dataset(self):
|
|
||||||
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
|
|
||||||
|
|
||||||
return self.dataset
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self.features)
|
|
||||||
|
|
||||||
def __getitem__(self, i) -> InputFeatures:
|
|
||||||
return self.features[i]
|
|
||||||
|
|
||||||
|
|
||||||
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
|
||||||
if isinstance(mode, Split):
|
|
||||||
mode = mode.value
|
|
||||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
|
||||||
guid_index = 1
|
|
||||||
examples = []
|
|
||||||
with open(file_path, encoding="utf-8") as f:
|
|
||||||
words = []
|
|
||||||
labels = []
|
|
||||||
for line in f:
|
|
||||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
|
||||||
if words:
|
|
||||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
|
||||||
guid_index += 1
|
|
||||||
words = []
|
|
||||||
labels = []
|
|
||||||
else:
|
|
||||||
splits = line.split(" ")
|
|
||||||
words.append(splits[0])
|
|
||||||
if len(splits) > 1:
|
|
||||||
labels.append(splits[-1].replace("\n", ""))
|
|
||||||
else:
|
|
||||||
# Examples could have no label for mode = "test"
|
|
||||||
labels.append("O")
|
|
||||||
if words:
|
|
||||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
|
||||||
return examples
|
|
||||||
|
|
||||||
|
def get_labels(self, path: str) -> List[str]:
    """Return the list of label strings for the task.

    This version has no implementation and always raises.
    NOTE(review): presumably concrete task classes override it; the
    enclosing class is not visible here, so confirm against its definition.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
|
||||||
|
|
||||||
def convert_examples_to_features(
|
def convert_examples_to_features(
|
||||||
|
self,
|
||||||
examples: List[InputExample],
|
examples: List[InputExample],
|
||||||
label_list: List[str],
|
label_list: List[str],
|
||||||
max_seq_length: int,
|
max_seq_length: int,
|
||||||
@@ -389,12 +202,169 @@ def convert_examples_to_features(
|
|||||||
return features
|
return features
|
||||||
|
|
||||||
|
|
||||||
def get_labels(path: str) -> List[str]:
    """Return the label vocabulary, loading it from ``path`` when one is given.

    With a falsy ``path`` a built-in list of nine tags is returned.
    Otherwise the file is read one label per line, and ``"O"`` is put in
    front of the list if the file does not already contain it.
    """
    if not path:
        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
    with open(path, "r") as f:
        labels = f.read().splitlines()
    if "O" not in labels:
        labels = ["O"] + labels
    return labels


if is_torch_available():
    import torch
    from torch import nn
    from torch.utils.data.dataset import Dataset

    class TokenClassificationDataset(Dataset):
        """
        Torch dataset of token-classification features, cached on disk.

        This will be superseded by a framework-agnostic approach
        soon.
        """

        features: List[InputFeatures]
        # Use cross entropy ignore_index as padding label id so that only
        # real label ids contribute to the loss later.
        pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index

        def __init__(
            self,
            token_classification_task: TokenClassificationTask,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
            model_type: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            """Load features from the cache file, or build them and write the cache.

            The cache file lives in ``data_dir`` and its name is derived from
            the split value, the tokenizer class name and ``max_seq_length``.
            ``token_classification_task`` supplies ``read_examples_from_file``
            and ``convert_examples_to_features``.
            """
            # Load data features from cache or dataset file
            cache_name = "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length))
            cached_features_file = os.path.join(data_dir, cache_name)

            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            with FileLock(lock_path):
                cache_usable = os.path.exists(cached_features_file) and not overwrite_cache
                if cache_usable:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {data_dir}")
                    examples = token_classification_task.read_examples_from_file(data_dir, mode)
                    is_xlnet = model_type in ["xlnet"]
                    # TODO clean up all this to leverage built-in features of tokenizers
                    self.features = token_classification_task.convert_examples_to_features(
                        examples,
                        labels,
                        max_seq_length,
                        tokenizer,
                        # xlnet has a cls token at the end
                        cls_token_at_end=bool(is_xlnet),
                        cls_token=tokenizer.cls_token,
                        cls_token_segment_id=2 if is_xlnet else 0,
                        sep_token=tokenizer.sep_token,
                        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                        sep_token_extra=False,
                        pad_on_left=bool(tokenizer.padding_side == "left"),
                        pad_token=tokenizer.pad_token_id,
                        pad_token_segment_id=tokenizer.pad_token_type_id,
                        pad_token_label_id=self.pad_token_label_id,
                    )
                    logger.info(f"Saving features into cached file {cached_features_file}")
                    torch.save(self.features, cached_features_file)

        def __len__(self):
            """Return the number of stored feature records."""
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            """Return the feature record at index ``i``."""
            return self.features[i]
|
||||||
|
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
class TFNerDataset:
    """
    TensorFlow dataset wrapper around token-classification features.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    features: List[InputFeatures]
    # Use cross entropy ignore_index as padding label id so that only
    # real label ids contribute to the loss later.
    pad_token_label_id: int = -100

    def __init__(
        self,
        token_classification_task: TokenClassificationTask,
        data_dir: str,
        tokenizer: PreTrainedTokenizer,
        labels: List[str],
        model_type: str,
        max_seq_length: Optional[int] = None,
        overwrite_cache=False,
        mode: Split = Split.train,
    ):
        """Build the features and wrap them in a generator-backed ``tf.data.Dataset``.

        ``token_classification_task`` supplies ``read_examples_from_file`` and
        ``convert_examples_to_features``. ``overwrite_cache`` is accepted but
        not read anywhere in this constructor.
        """
        examples = token_classification_task.read_examples_from_file(data_dir, mode)
        is_xlnet = model_type in ["xlnet"]
        # TODO clean up all this to leverage built-in features of tokenizers
        self.features = token_classification_task.convert_examples_to_features(
            examples,
            labels,
            max_seq_length,
            tokenizer,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=False,
            pad_on_left=bool(tokenizer.padding_side == "left"),
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=self.pad_token_label_id,
        )

        def gen():
            # Each item is (model inputs dict, label ids); token_type_ids is
            # only included for features that actually carry it.
            for ex in self.features:
                inputs = {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}
                if ex.token_type_ids is not None:
                    inputs["token_type_ids"] = ex.token_type_ids
                yield (inputs, ex.label_ids)

        # The declared dataset signature follows the tokenizer's input names,
        # independently of what individual features contain.
        input_names = ["input_ids", "attention_mask"]
        if "token_type_ids" in tokenizer.model_input_names:
            input_names.append("token_type_ids")

        output_types = ({name: tf.int32 for name in input_names}, tf.int64)
        output_shapes = (
            {name: tf.TensorShape([None]) for name in input_names},
            tf.TensorShape([None]),
        )
        self.dataset = tf.data.Dataset.from_generator(gen, output_types, output_shapes)

    def get_dataset(self):
        """Apply ``assert_cardinality(len(self.features))`` to the dataset, store and return it."""
        expected_size = len(self.features)
        self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(expected_size))
        return self.dataset

    def __len__(self):
        """Return the number of stored feature records."""
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        """Return the feature record at index ``i``."""
        return self.features[i]
|
||||||
|
|||||||
Reference in New Issue
Block a user