Add new token classification example (#8340)
* Add new token classification example * Remove txt file * Add test * With actual testing done * Less warmup is better * Update examples/token-classification/run_ner_new.py Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com> * Address review comments * Fix test * Make Lysandre happy * Last touches and rename * Rename in tests * Address review comments * More run_ner -> run_ner_old Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
This commit is contained in:
@@ -37,7 +37,7 @@ git checkout tags/v3.4.0
|
|||||||
|---|---|:---:|:---:|:---:|:---:|
|
|---|---|:---:|:---:|:---:|:---:|
|
||||||
| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling) | Raw text | ✅ | - | ✅ | [](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
|
| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling) | Raw text | ✅ | - | ✅ | [](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
|
||||||
| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification) | GLUE, XNLI | ✅ | ✅ | ✅ | [](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
|
| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification) | GLUE, XNLI | ✅ | ✅ | ✅ | [](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
|
||||||
| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER | ✅ | ✅ | - | -
|
| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER | ✅ | ✅ | ✅ | -
|
||||||
| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice) | SWAG, RACE, ARC | ✅ | ✅ | - | [](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
|
| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice) | SWAG, RACE, ARC | ✅ | ✅ | - | [](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
|
||||||
| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering) | SQuAD | ✅ | ✅ | - | -
|
| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering) | SQuAD | ✅ | ✅ | - | -
|
||||||
| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation) | - | n/a | n/a | - | [](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
|
| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation) | - | n/a | n/a | - | [](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
|
||||||
|
|||||||
@@ -28,7 +28,13 @@ from transformers.testing_utils import TestCasePlus, torch_device
|
|||||||
|
|
||||||
SRC_DIRS = [
|
SRC_DIRS = [
|
||||||
os.path.join(os.path.dirname(__file__), dirname)
|
os.path.join(os.path.dirname(__file__), dirname)
|
||||||
for dirname in ["text-generation", "text-classification", "language-modeling", "question-answering"]
|
for dirname in [
|
||||||
|
"text-generation",
|
||||||
|
"text-classification",
|
||||||
|
"token-classification",
|
||||||
|
"language-modeling",
|
||||||
|
"question-answering",
|
||||||
|
]
|
||||||
]
|
]
|
||||||
sys.path.extend(SRC_DIRS)
|
sys.path.extend(SRC_DIRS)
|
||||||
|
|
||||||
@@ -38,6 +44,7 @@ if SRC_DIRS is not None:
|
|||||||
import run_generation
|
import run_generation
|
||||||
import run_glue
|
import run_glue
|
||||||
import run_mlm
|
import run_mlm
|
||||||
|
import run_ner
|
||||||
import run_pl_glue
|
import run_pl_glue
|
||||||
import run_squad
|
import run_squad
|
||||||
|
|
||||||
@@ -185,6 +192,36 @@ class ExamplesTests(TestCasePlus):
|
|||||||
result = run_mlm.main()
|
result = run_mlm.main()
|
||||||
self.assertLess(result["perplexity"], 42)
|
self.assertLess(result["perplexity"], 42)
|
||||||
|
|
||||||
|
def test_run_ner(self):
|
||||||
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
|
logger.addHandler(stream_handler)
|
||||||
|
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_ner.py
|
||||||
|
--model_name_or_path bert-base-uncased
|
||||||
|
--train_file tests/fixtures/tests_samples/conll/sample.json
|
||||||
|
--validation_file tests/fixtures/tests_samples/conll/sample.json
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--warmup_steps=2
|
||||||
|
--learning_rate=2e-4
|
||||||
|
--per_gpu_train_batch_size=2
|
||||||
|
--per_gpu_eval_batch_size=2
|
||||||
|
--num_train_epochs=2
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
if torch_device != "cuda":
|
||||||
|
testargs.append("--no_cuda")
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
result = run_ner.main()
|
||||||
|
self.assertGreaterEqual(result["eval_accuracy_score"], 0.75)
|
||||||
|
self.assertGreaterEqual(result["eval_precision"], 0.75)
|
||||||
|
self.assertLess(result["eval_loss"], 0.5)
|
||||||
|
|
||||||
def test_run_squad(self):
|
def test_run_squad(self):
|
||||||
stream_handler = logging.StreamHandler(sys.stdout)
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
logger.addHandler(stream_handler)
|
logger.addHandler(stream_handler)
|
||||||
|
|||||||
@@ -1,6 +1,40 @@
|
|||||||
## Named Entity Recognition
|
## Token classification
|
||||||
|
|
||||||
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py) for Pytorch and
|
Fine-tuning the library models for token classification tasks such as Named Entity Recognition (NER) or Part-of-speech
|
||||||
|
tagging (POS). The main script `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
|
||||||
|
customize it to your needs if you need extra processing on your datasets.
|
||||||
|
|
||||||
|
It will either run on a dataset hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
|
||||||
|
training and validation.
|
||||||
|
|
||||||
|
The following example fine-tunes BERT on CoNLL-2003:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_ner.py \
|
||||||
|
--model_name_or_path bert-base-uncased \
|
||||||
|
--dataset_name conll2003 \
|
||||||
|
--output_dir /tmp/test-ner \
|
||||||
|
--do_train \
|
||||||
|
--do_eval
|
||||||
|
```
|
||||||
|
|
||||||
|
or you can just run the bash script `run.sh`.
|
||||||
|
|
||||||
|
To run on your own training and validation files, use the following command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_ner.py \
|
||||||
|
--model_name_or_path bert-base-uncased \
|
||||||
|
--train_file path_to_train_file \
|
||||||
|
--validation_file path_to_validation_file \
|
||||||
|
--output_dir /tmp/test-ner \
|
||||||
|
--do_train \
|
||||||
|
--do_eval
|
||||||
|
```
|
||||||
|
|
||||||
|
## Old version of the script
|
||||||
|
|
||||||
|
Based on the scripts [`run_ner_old.py`](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py) for Pytorch and
|
||||||
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_tf_ner.py) for Tensorflow 2.
|
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_tf_ner.py) for Tensorflow 2.
|
||||||
|
|
||||||
The following examples are covered in this section:
|
The following examples are covered in this section:
|
||||||
@@ -69,7 +103,7 @@ export SEED=1
|
|||||||
To start training, just run:
|
To start training, just run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 run_ner.py --data_dir ./ \
|
python3 run_ner_old.py --data_dir ./ \
|
||||||
--labels ./labels.txt \
|
--labels ./labels.txt \
|
||||||
--model_name_or_path $BERT_MODEL \
|
--model_name_or_path $BERT_MODEL \
|
||||||
--output_dir $OUTPUT_DIR \
|
--output_dir $OUTPUT_DIR \
|
||||||
@@ -87,7 +121,7 @@ If your GPU supports half-precision training, just add the `--fp16` flag. After
|
|||||||
|
|
||||||
#### JSON-based configuration file
|
#### JSON-based configuration file
|
||||||
|
|
||||||
Instead of passing all parameters via commandline arguments, the `run_ner.py` script also supports reading parameters from a json-based configuration file:
|
Instead of passing all parameters via commandline arguments, the `run_ner_old.py` script also supports reading parameters from a json-based configuration file:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
@@ -106,7 +140,7 @@ Instead of passing all parameters via commandline arguments, the `run_ner.py` sc
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
It must be saved with a `.json` extension and can be used by running `python3 run_ner.py config.json`.
|
It must be saved with a `.json` extension and can be used by running `python3 run_ner_old.py config.json`.
|
||||||
|
|
||||||
#### Evaluation
|
#### Evaluation
|
||||||
|
|
||||||
@@ -250,7 +284,7 @@ cat data_wnut_17/train.txt data_wnut_17/dev.txt data_wnut_17/test.txt | cut -d "
|
|||||||
|
|
||||||
#### Run the Pytorch version
|
#### Run the Pytorch version
|
||||||
|
|
||||||
Fine-tuning with the PyTorch version can be started using the `run_ner.py` script. In this example we use a JSON-based configuration file.
|
Fine-tuning with the PyTorch version can be started using the `run_ner_old.py` script. In this example we use a JSON-based configuration file.
|
||||||
|
|
||||||
This configuration file looks like:
|
This configuration file looks like:
|
||||||
|
|
||||||
@@ -274,7 +308,7 @@ This configuration file looks like:
|
|||||||
|
|
||||||
If your GPU supports half-precision training, please set `fp16` to `true`.
|
If your GPU supports half-precision training, please set `fp16` to `true`.
|
||||||
|
|
||||||
Save this JSON-based configuration under `wnut_17.json`. The fine-tuning can be started with `python3 run_ner.py wnut_17.json`.
|
Save this JSON-based configuration under `wnut_17.json`. The fine-tuning can be started with `python3 run_ner_old.py wnut_17.json`.
|
||||||
|
|
||||||
#### Evaluation
|
#### Evaluation
|
||||||
|
|
||||||
|
|||||||
@@ -1,36 +1,6 @@
|
|||||||
## The relevant files are currently on a shared Google
|
|
||||||
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
|
|
||||||
## Monitor for changes and eventually migrate to nlp dataset
|
|
||||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
|
||||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
|
||||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
|
||||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
|
||||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
|
||||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
|
||||||
|
|
||||||
export MAX_LENGTH=128
|
|
||||||
export BERT_MODEL=bert-base-multilingual-cased
|
|
||||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
|
||||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
|
||||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
|
||||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
|
||||||
export OUTPUT_DIR=germeval-model
|
|
||||||
export BATCH_SIZE=32
|
|
||||||
export NUM_EPOCHS=3
|
|
||||||
export SAVE_STEPS=750
|
|
||||||
export SEED=1
|
|
||||||
|
|
||||||
python3 run_ner.py \
|
python3 run_ner.py \
|
||||||
--task_type NER \
|
--model_name_or_path bert-base-uncased \
|
||||||
--data_dir . \
|
--dataset_name conll2003 \
|
||||||
--labels ./labels.txt \
|
--output_dir /tmp/test-ner \
|
||||||
--model_name_or_path $BERT_MODEL \
|
--do_train \
|
||||||
--output_dir $OUTPUT_DIR \
|
--do_eval
|
||||||
--max_seq_length $MAX_LENGTH \
|
|
||||||
--num_train_epochs $NUM_EPOCHS \
|
|
||||||
--per_gpu_train_batch_size $BATCH_SIZE \
|
|
||||||
--save_steps $SAVE_STEPS \
|
|
||||||
--seed $SEED \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--do_predict
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ export NUM_EPOCHS=3
|
|||||||
export SAVE_STEPS=750
|
export SAVE_STEPS=750
|
||||||
export SEED=1
|
export SEED=1
|
||||||
|
|
||||||
python3 run_ner.py \
|
python3 run_ner_old.py \
|
||||||
--task_type Chunk \
|
--task_type Chunk \
|
||||||
--data_dir . \
|
--data_dir . \
|
||||||
--model_name_or_path $BERT_MODEL \
|
--model_name_or_path $BERT_MODEL \
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
# Copyright 2020 The HuggingFace Team All rights reserved.
|
||||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
# you may not use this file except in compliance with the License.
|
||||||
@@ -13,29 +12,33 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """
|
"""
|
||||||
|
Fine-tuning the library models for token classification.
|
||||||
|
"""
|
||||||
|
# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as comments.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from importlib import import_module
|
from typing import Optional
|
||||||
from typing import Dict, List, Optional, Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from datasets import load_dataset
|
||||||
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
|
import transformers
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoModelForTokenClassification,
|
AutoModelForTokenClassification,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
EvalPrediction,
|
DataCollatorForTokenClassification,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
Trainer,
|
Trainer,
|
||||||
TrainingArguments,
|
TrainingArguments,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
|
from transformers.trainer_utils import is_main_process
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -53,15 +56,9 @@ class ModelArguments:
|
|||||||
config_name: Optional[str] = field(
|
config_name: Optional[str] = field(
|
||||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||||
)
|
)
|
||||||
task_type: Optional[str] = field(
|
|
||||||
default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
|
|
||||||
)
|
|
||||||
tokenizer_name: Optional[str] = field(
|
tokenizer_name: Optional[str] = field(
|
||||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||||
)
|
)
|
||||||
use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
|
|
||||||
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
|
|
||||||
# or just modify its tokenizer_config.json.
|
|
||||||
cache_dir: Optional[str] = field(
|
cache_dir: Optional[str] = field(
|
||||||
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
|
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
|
||||||
)
|
)
|
||||||
@@ -73,23 +70,58 @@ class DataTrainingArguments:
|
|||||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
data_dir: str = field(
|
task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
|
||||||
metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."}
|
dataset_name: Optional[str] = field(
|
||||||
|
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
|
||||||
)
|
)
|
||||||
labels: Optional[str] = field(
|
dataset_config_name: Optional[str] = field(
|
||||||
|
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||||
|
)
|
||||||
|
train_file: Optional[str] = field(
|
||||||
|
default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
|
||||||
|
)
|
||||||
|
validation_file: Optional[str] = field(
|
||||||
default=None,
|
default=None,
|
||||||
metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."},
|
metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
|
||||||
)
|
)
|
||||||
max_seq_length: int = field(
|
test_file: Optional[str] = field(
|
||||||
default=128,
|
default=None,
|
||||||
metadata={
|
metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
|
||||||
"help": "The maximum total input sequence length after tokenization. Sequences longer "
|
|
||||||
"than this will be truncated, sequences shorter will be padded."
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
overwrite_cache: bool = field(
|
overwrite_cache: bool = field(
|
||||||
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
||||||
)
|
)
|
||||||
|
preprocessing_num_workers: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "The number of processes to use for the preprocessing."},
|
||||||
|
)
|
||||||
|
pad_to_max_length: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={
|
||||||
|
"help": "Whether to pad all samples to model maximum sentence length. "
|
||||||
|
"If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
|
||||||
|
"efficient on GPU but very bad for TPU."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
label_all_tokens: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={
|
||||||
|
"help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
|
||||||
|
"one (in which case the other tokens will have a padding index)."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
|
||||||
|
raise ValueError("Need either a dataset name or a training/validation file.")
|
||||||
|
else:
|
||||||
|
if self.train_file is not None:
|
||||||
|
extension = self.train_file.split(".")[-1]
|
||||||
|
assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
||||||
|
if self.validation_file is not None:
|
||||||
|
extension = self.validation_file.split(".")[-1]
|
||||||
|
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
|
||||||
|
self.task_name = self.task_name.lower()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -112,60 +144,90 @@ def main():
|
|||||||
and not training_args.overwrite_output_dir
|
and not training_args.overwrite_output_dir
|
||||||
):
|
):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
f"Output directory ({training_args.output_dir}) already exists and is not empty."
|
||||||
)
|
"Use --overwrite_output_dir to overcome."
|
||||||
|
|
||||||
module = import_module("tasks")
|
|
||||||
try:
|
|
||||||
token_classification_task_clazz = getattr(module, model_args.task_type)
|
|
||||||
token_classification_task: TokenClassificationTask = token_classification_task_clazz()
|
|
||||||
except AttributeError:
|
|
||||||
raise ValueError(
|
|
||||||
f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
|
|
||||||
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
datefmt="%m/%d/%Y %H:%M:%S",
|
datefmt="%m/%d/%Y %H:%M:%S",
|
||||||
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
|
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Log on each process the small summary:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
|
||||||
training_args.local_rank,
|
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
|
||||||
training_args.device,
|
|
||||||
training_args.n_gpu,
|
|
||||||
bool(training_args.local_rank != -1),
|
|
||||||
training_args.fp16,
|
|
||||||
)
|
)
|
||||||
|
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||||
|
if is_main_process(training_args.local_rank):
|
||||||
|
transformers.utils.logging.set_verbosity_info()
|
||||||
logger.info("Training/evaluation parameters %s", training_args)
|
logger.info("Training/evaluation parameters %s", training_args)
|
||||||
|
|
||||||
# Set seed
|
# Set seed before initializing model.
|
||||||
set_seed(training_args.seed)
|
set_seed(training_args.seed)
|
||||||
|
|
||||||
# Prepare CONLL-2003 task
|
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
|
||||||
labels = token_classification_task.get_labels(data_args.labels)
|
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
|
||||||
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
|
# (the dataset will be downloaded automatically from the datasets Hub).
|
||||||
num_labels = len(labels)
|
#
|
||||||
|
# For CSV/JSON files, this script will use the column called 'words' or the first column if no column called
|
||||||
|
# 'words' is found. You can easily tweak this behavior (see below).
|
||||||
|
#
|
||||||
|
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||||
|
# download the dataset.
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
# Downloading and loading a dataset from the hub.
|
||||||
|
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
|
||||||
|
else:
|
||||||
|
data_files = {}
|
||||||
|
if data_args.train_file is not None:
|
||||||
|
data_files["train"] = data_args.train_file
|
||||||
|
if data_args.validation_file is not None:
|
||||||
|
data_files["validation"] = data_args.validation_file
|
||||||
|
if data_args.test_file is not None:
|
||||||
|
data_files["test"] = data_args.test_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
|
datasets = load_dataset(extension, data_files=data_files)
|
||||||
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
|
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||||
|
|
||||||
|
if training_args.do_train:
|
||||||
|
column_names = datasets["train"].column_names
|
||||||
|
else:
|
||||||
|
column_names = datasets["validation"].column_names
|
||||||
|
text_column_name = "words" if "words" in column_names else column_names[0]
|
||||||
|
label_column_name = data_args.task_name if data_args.task_name in column_names else column_names[1]
|
||||||
|
|
||||||
|
# Labeling (this part will be easier when https://github.com/huggingface/datasets/issues/797 is solved)
|
||||||
|
def get_label_list(labels):
|
||||||
|
unique_labels = set()
|
||||||
|
for label in labels:
|
||||||
|
unique_labels = unique_labels | set(label)
|
||||||
|
label_list = list(unique_labels)
|
||||||
|
label_list.sort()
|
||||||
|
return label_list
|
||||||
|
|
||||||
|
label_list = get_label_list(datasets["train"][label_column_name])
|
||||||
|
label_to_id = {l: i for i, l in enumerate(label_list)}
|
||||||
|
num_labels = len(label_list)
|
||||||
|
|
||||||
# Load pretrained model and tokenizer
|
# Load pretrained model and tokenizer
|
||||||
#
|
#
|
||||||
# Distributed training:
|
# Distributed training:
|
||||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||||
# download model & vocab.
|
# download model & vocab.
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(
|
config = AutoConfig.from_pretrained(
|
||||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||||
num_labels=num_labels,
|
num_labels=num_labels,
|
||||||
id2label=label_map,
|
finetuning_task=data_args.task_name,
|
||||||
label2id={label: i for i, label in enumerate(labels)},
|
|
||||||
cache_dir=model_args.cache_dir,
|
cache_dir=model_args.cache_dir,
|
||||||
)
|
)
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||||
cache_dir=model_args.cache_dir,
|
cache_dir=model_args.cache_dir,
|
||||||
use_fast=model_args.use_fast,
|
use_fast=True,
|
||||||
)
|
)
|
||||||
model = AutoModelForTokenClassification.from_pretrained(
|
model = AutoModelForTokenClassification.from_pretrained(
|
||||||
model_args.model_name_or_path,
|
model_args.model_name_or_path,
|
||||||
@@ -174,67 +236,85 @@ def main():
|
|||||||
cache_dir=model_args.cache_dir,
|
cache_dir=model_args.cache_dir,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get datasets
|
# Preprocessing the dataset
|
||||||
train_dataset = (
|
# Padding strategy
|
||||||
TokenClassificationDataset(
|
padding = "max_length" if data_args.pad_to_max_length else False
|
||||||
token_classification_task=token_classification_task,
|
|
||||||
data_dir=data_args.data_dir,
|
# Tokenize all texts and align the labels with them.
|
||||||
tokenizer=tokenizer,
|
def tokenize_and_align_labels(examples):
|
||||||
labels=labels,
|
tokenized_inputs = tokenizer(
|
||||||
model_type=config.model_type,
|
examples[text_column_name],
|
||||||
max_seq_length=data_args.max_seq_length,
|
padding=padding,
|
||||||
overwrite_cache=data_args.overwrite_cache,
|
truncation=True,
|
||||||
mode=Split.train,
|
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
|
||||||
|
is_split_into_words=True,
|
||||||
|
return_offsets_mapping=True,
|
||||||
)
|
)
|
||||||
if training_args.do_train
|
offset_mappings = tokenized_inputs.pop("offset_mapping")
|
||||||
else None
|
labels = []
|
||||||
)
|
for label, offset_mapping in zip(examples[label_column_name], offset_mappings):
|
||||||
eval_dataset = (
|
label_index = 0
|
||||||
TokenClassificationDataset(
|
current_label = -100
|
||||||
token_classification_task=token_classification_task,
|
label_ids = []
|
||||||
data_dir=data_args.data_dir,
|
for offset in offset_mapping:
|
||||||
tokenizer=tokenizer,
|
# We set the label for the first token of each word. Special characters will have an offset of (0, 0)
|
||||||
labels=labels,
|
# so the test ignores them.
|
||||||
model_type=config.model_type,
|
if offset[0] == 0 and offset[1] != 0:
|
||||||
max_seq_length=data_args.max_seq_length,
|
current_label = label_to_id[label[label_index]]
|
||||||
overwrite_cache=data_args.overwrite_cache,
|
label_index += 1
|
||||||
mode=Split.dev,
|
label_ids.append(current_label)
|
||||||
)
|
# For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
|
||||||
if training_args.do_eval
|
elif offset[0] == 0 and offset[1] == 0:
|
||||||
else None
|
label_ids.append(-100)
|
||||||
|
# For the other tokens in a word, we set the label to either the current label or -100, depending on
|
||||||
|
# the label_all_tokens flag.
|
||||||
|
else:
|
||||||
|
label_ids.append(current_label if data_args.label_all_tokens else -100)
|
||||||
|
|
||||||
|
labels.append(label_ids)
|
||||||
|
tokenized_inputs["labels"] = labels
|
||||||
|
return tokenized_inputs
|
||||||
|
|
||||||
|
tokenized_datasets = datasets.map(
|
||||||
|
tokenize_and_align_labels,
|
||||||
|
batched=True,
|
||||||
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
)
|
||||||
|
|
||||||
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
|
# Data collator
|
||||||
preds = np.argmax(predictions, axis=2)
|
data_collator = DataCollatorForTokenClassification(tokenizer)
|
||||||
|
|
||||||
batch_size, seq_len = preds.shape
|
# Metrics
|
||||||
|
def compute_metrics(p):
|
||||||
|
predictions, labels = p
|
||||||
|
predictions = np.argmax(predictions, axis=2)
|
||||||
|
|
||||||
out_label_list = [[] for _ in range(batch_size)]
|
# Remove ignored index (special tokens)
|
||||||
preds_list = [[] for _ in range(batch_size)]
|
true_predictions = [
|
||||||
|
[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
|
||||||
|
for prediction, label in zip(predictions, labels)
|
||||||
|
]
|
||||||
|
true_labels = [
|
||||||
|
[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
|
||||||
|
for prediction, label in zip(predictions, labels)
|
||||||
|
]
|
||||||
|
|
||||||
for i in range(batch_size):
|
|
||||||
for j in range(seq_len):
|
|
||||||
if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
|
|
||||||
out_label_list[i].append(label_map[label_ids[i][j]])
|
|
||||||
preds_list[i].append(label_map[preds[i][j]])
|
|
||||||
|
|
||||||
return preds_list, out_label_list
|
|
||||||
|
|
||||||
def compute_metrics(p: EvalPrediction) -> Dict:
|
|
||||||
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
|
|
||||||
return {
|
return {
|
||||||
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
"accuracy_score": accuracy_score(true_labels, true_predictions),
|
||||||
"precision": precision_score(out_label_list, preds_list),
|
"precision": precision_score(true_labels, true_predictions),
|
||||||
"recall": recall_score(out_label_list, preds_list),
|
"recall": recall_score(true_labels, true_predictions),
|
||||||
"f1": f1_score(out_label_list, preds_list),
|
"f1": f1_score(true_labels, true_predictions),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Initialize our Trainer
|
# Initialize our Trainer
|
||||||
trainer = Trainer(
|
trainer = Trainer(
|
||||||
model=model,
|
model=model,
|
||||||
args=training_args,
|
args=training_args,
|
||||||
train_dataset=train_dataset,
|
train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
|
||||||
eval_dataset=eval_dataset,
|
eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
data_collator=data_collator,
|
||||||
compute_metrics=compute_metrics,
|
compute_metrics=compute_metrics,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -243,58 +323,50 @@ def main():
|
|||||||
trainer.train(
|
trainer.train(
|
||||||
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
||||||
)
|
)
|
||||||
trainer.save_model()
|
trainer.save_model() # Saves the tokenizer too for easy upload
|
||||||
# For convenience, we also re-save the tokenizer to the same directory,
|
|
||||||
# so that you can share your model easily on huggingface.co/models =)
|
|
||||||
if trainer.is_world_master():
|
|
||||||
tokenizer.save_pretrained(training_args.output_dir)
|
|
||||||
|
|
||||||
# Evaluation
|
# Evaluation
|
||||||
results = {}
|
results = {}
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
logger.info("*** Evaluate ***")
|
logger.info("*** Evaluate ***")
|
||||||
|
|
||||||
result = trainer.evaluate()
|
results = trainer.evaluate()
|
||||||
|
|
||||||
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
|
output_eval_file = os.path.join(training_args.output_dir, "eval_results_ner.txt")
|
||||||
if trainer.is_world_master():
|
if trainer.is_world_process_zero():
|
||||||
with open(output_eval_file, "w") as writer:
|
with open(output_eval_file, "w") as writer:
|
||||||
logger.info("***** Eval results *****")
|
logger.info("***** Eval results *****")
|
||||||
for key, value in result.items():
|
for key, value in results.items():
|
||||||
logger.info(" %s = %s", key, value)
|
logger.info(f" {key} = {value}")
|
||||||
writer.write("%s = %s\n" % (key, value))
|
writer.write(f"{key} = {value}\n")
|
||||||
|
|
||||||
results.update(result)
|
|
||||||
|
|
||||||
# Predict
|
# Predict
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
test_dataset = TokenClassificationDataset(
|
logger.info("*** Predict ***")
|
||||||
token_classification_task=token_classification_task,
|
|
||||||
data_dir=data_args.data_dir,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
labels=labels,
|
|
||||||
model_type=config.model_type,
|
|
||||||
max_seq_length=data_args.max_seq_length,
|
|
||||||
overwrite_cache=data_args.overwrite_cache,
|
|
||||||
mode=Split.test,
|
|
||||||
)
|
|
||||||
|
|
||||||
predictions, label_ids, metrics = trainer.predict(test_dataset)
|
test_dataset = tokenized_datasets["test"]
|
||||||
preds_list, _ = align_predictions(predictions, label_ids)
|
predictions, labels, metrics = trainer.predict(test_dataset)
|
||||||
|
predictions = np.argmax(predictions, axis=2)
|
||||||
|
|
||||||
|
# Remove ignored index (special tokens)
|
||||||
|
true_predictions = [
|
||||||
|
[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
|
||||||
|
for prediction, label in zip(predictions, labels)
|
||||||
|
]
|
||||||
|
|
||||||
output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
|
output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
|
||||||
if trainer.is_world_master():
|
if trainer.is_world_master():
|
||||||
with open(output_test_results_file, "w") as writer:
|
with open(output_test_results_file, "w") as writer:
|
||||||
for key, value in metrics.items():
|
for key, value in metrics.items():
|
||||||
logger.info(" %s = %s", key, value)
|
logger.info(f" {key} = {value}")
|
||||||
writer.write("%s = %s\n" % (key, value))
|
writer.write(f"{key} = {value}\n")
|
||||||
|
|
||||||
# Save predictions
|
# Save predictions
|
||||||
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
|
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
|
||||||
if trainer.is_world_master():
|
if trainer.is_world_master():
|
||||||
with open(output_test_predictions_file, "w") as writer:
|
with open(output_test_predictions_file, "w") as writer:
|
||||||
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
|
for prediction in true_predictions:
|
||||||
token_classification_task.write_predictions_to_file(writer, f, preds_list)
|
writer.write(" ".join(prediction) + "\n")
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
308
examples/token-classification/run_ner_old.py
Normal file
308
examples/token-classification/run_ner_old.py
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from importlib import import_module
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from transformers import (
|
||||||
|
AutoConfig,
|
||||||
|
AutoModelForTokenClassification,
|
||||||
|
AutoTokenizer,
|
||||||
|
EvalPrediction,
|
||||||
|
HfArgumentParser,
|
||||||
|
Trainer,
|
||||||
|
TrainingArguments,
|
||||||
|
set_seed,
|
||||||
|
)
|
||||||
|
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    # Required: local path or hub identifier of the pretrained model.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Optional config override; main() falls back to model_name_or_path when unset.
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    # Name of the task class that main() looks up on the `tasks` module.
    task_type: Optional[str] = field(
        default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
    )
    # Optional tokenizer override; main() falls back to model_name_or_path when unset.
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
    # or just modify its tokenizer_config.json.
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # Required: directory holding the CoNLL-formatted .txt splits.
    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."}
    )
    # Optional path to a label file; passed to the task's get_labels() in main().
    labels: Optional[str] = field(
        default=None,
        metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."},
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Fine-tune and/or evaluate a token classification model.

    Parses the model, data and training arguments (from the command line or from a
    single JSON file), loads the config/tokenizer/model, builds the requested datasets
    and then runs training, evaluation and prediction depending on the
    ``do_train`` / ``do_eval`` / ``do_predict`` flags.

    Returns:
        The dictionary of evaluation results; empty when ``do_eval`` is not set.

    Raises:
        ValueError: if the output directory is non-empty and may not be overwritten, or
            if the requested task type is not available in the ``tasks`` module.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # The task implementation is resolved by name on the local `tasks` module.
    module = import_module("tasks")
    try:
        token_classification_task_clazz = getattr(module, model_args.task_type)
        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
    except AttributeError as err:
        raise ValueError(
            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
        ) from err

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = token_classification_task.get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(
        predictions: np.ndarray, label_ids: np.ndarray
    ) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Convert raw model outputs and gold label ids into per-example lists of label names.

        Positions whose gold label equals the ignore index of ``nn.CrossEntropyLoss`` are
        dropped from both returned lists.

        Returns:
            ``(preds_list, out_label_list)``: predicted and gold label names per example.
        """
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        # Look the ignore index up once instead of building a loss module per token.
        ignore_index = nn.CrossEntropyLoss().ignore_index

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        """Compute the seqeval accuracy, precision, recall and F1 for an evaluation pass."""
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            "accuracy_score": accuracy_score(out_label_list, preds_list),
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
                    token_classification_task.write_predictions_to_file(writer, f, preds_list)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _mp_fn(index):
    """Entry point for xla_spawn (TPUs); ``index`` is accepted but not used."""
    main()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
36
examples/token-classification/run_old.sh
Executable file
36
examples/token-classification/run_old.sh
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
## The relevant files are currently on a shared Google
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
## Monitor for changes and eventually migrate to nlp dataset

# Download each split, drop comment lines, and keep columns 2 and 3 separated by a space.
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp

export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased
python3 scripts/preprocess.py train.txt.tmp "$BERT_MODEL" "$MAX_LENGTH" > train.txt
python3 scripts/preprocess.py dev.txt.tmp "$BERT_MODEL" "$MAX_LENGTH" > dev.txt
python3 scripts/preprocess.py test.txt.tmp "$BERT_MODEL" "$MAX_LENGTH" > test.txt

# Collect the distinct labels (second column) across all three splits.
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
export OUTPUT_DIR=germeval-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1

python3 run_ner_old.py \
--task_type NER \
--data_dir . \
--labels ./labels.txt \
--model_name_or_path "$BERT_MODEL" \
--output_dir "$OUTPUT_DIR" \
--max_seq_length "$MAX_LENGTH" \
--num_train_epochs "$NUM_EPOCHS" \
--per_gpu_train_batch_size "$BATCH_SIZE" \
--save_steps "$SAVE_STEPS" \
--seed "$SEED" \
--do_train \
--do_eval \
--do_predict
|
||||||
@@ -21,7 +21,7 @@ export NUM_EPOCHS=3
|
|||||||
export SAVE_STEPS=750
|
export SAVE_STEPS=750
|
||||||
export SEED=1
|
export SEED=1
|
||||||
|
|
||||||
python3 run_ner.py \
|
python3 run_ner_old.py \
|
||||||
--task_type POS \
|
--task_type POS \
|
||||||
--data_dir . \
|
--data_dir . \
|
||||||
--model_name_or_path $BERT_MODEL \
|
--model_name_or_path $BERT_MODEL \
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import sys
|
|||||||
import unittest
|
import unittest
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import run_ner
|
import run_ner_old as run_ner
|
||||||
from transformers.testing_utils import slow
|
from transformers.testing_utils import slow
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp
|
|||||||
| Dev | 40 K |
|
| Dev | 40 K |
|
||||||
|
|
||||||
|
|
||||||
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
|
|
||||||
- Labels covered:
|
- Labels covered:
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ This model is a fine-tuned on [CONLL CORPORA](https://www.kaggle.com/nltkdata/co
|
|||||||
| Train | 445 K |
|
| Train | 445 K |
|
||||||
| Dev | 55 K |
|
| Dev | 55 K |
|
||||||
|
|
||||||
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
|
|
||||||
- Labels covered:
|
- Labels covered:
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ I preprocessed the dataset and split it as train / dev (80/20)
|
|||||||
| Dev | 2.2 K |
|
| Dev | 2.2 K |
|
||||||
|
|
||||||
|
|
||||||
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
|
|
||||||
- Labels covered:
|
- Labels covered:
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ Court decisions from 2017 and 2018 were selected for the dataset, published onli
|
|||||||
| Train | 1657048 |
|
| Train | 1657048 |
|
||||||
| Eval | 500000 |
|
| Eval | 500000 |
|
||||||
|
|
||||||
- Training script: [Fine-tuning script for NER provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
- Training script: [Fine-tuning script for NER provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
Colab: [How to fine-tune a model for NER using HF scripts](https://colab.research.google.com/drive/156Qrd7NsUHwA3nmQ6gXdZY0NzOvqk9AT?usp=sharing)
|
Colab: [How to fine-tune a model for NER using HF scripts](https://colab.research.google.com/drive/156Qrd7NsUHwA3nmQ6gXdZY0NzOvqk9AT?usp=sharing)
|
||||||
|
|
||||||
- Labels covered (and its distribution):
|
- Labels covered (and its distribution):
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ thumbnail:
|
|||||||
|
|
||||||
- Dataset: [GitHub Typo Corpus](https://github.com/mhagiwara/github-typo-corpus) 📚
|
- Dataset: [GitHub Typo Corpus](https://github.com/mhagiwara/github-typo-corpus) 📚
|
||||||
|
|
||||||
- [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py) 🏋️♂️
|
- [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py) 🏋️♂️
|
||||||
|
|
||||||
## Metrics on test set 📋
|
## Metrics on test set 📋
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ I preprocessed the dataset and split it as train / dev (80/20)
|
|||||||
| Dev | 2.2 K |
|
| Dev | 2.2 K |
|
||||||
|
|
||||||
|
|
||||||
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
|
|
||||||
- Labels covered:
|
- Labels covered:
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ This model is a fine-tuned version of the Spanish BERT [(BETO)](https://github.c
|
|||||||
|
|
||||||
- [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora)
|
- [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora)
|
||||||
|
|
||||||
#### [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
#### [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
|
|
||||||
#### 21 Syntax annotations (Labels) covered:
|
#### 21 Syntax annotations (Labels) covered:
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ I preprocessed the dataset and split it as train / dev (80/20)
|
|||||||
| Dev | 50 K |
|
| Dev | 50 K |
|
||||||
|
|
||||||
|
|
||||||
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py)
|
- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py)
|
||||||
|
|
||||||
- **60** Labels covered:
|
- **60** Labels covered:
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ thumbnail:
|
|||||||
|
|
||||||
- Dataset: [GitHub Typo Corpus](https://github.com/mhagiwara/github-typo-corpus) 📚 for 15 languages
|
- Dataset: [GitHub Typo Corpus](https://github.com/mhagiwara/github-typo-corpus) 📚 for 15 languages
|
||||||
|
|
||||||
- [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner.py) 🏋️♂️
|
- [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_old.py) 🏋️♂️
|
||||||
|
|
||||||
## Metrics on test set 📋
|
## Metrics on test set 📋
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ export SEED=1
|
|||||||
```
|
```
|
||||||
Then run pre-training:
|
Then run pre-training:
|
||||||
```
|
```
|
||||||
python3 run_ner.py --data_dir ./tr-data3 \
|
python3 run_ner_old.py --data_dir ./tr-data3 \
|
||||||
--model_type bert \
|
--model_type bert \
|
||||||
--labels ./tr-data/labels.txt \
|
--labels ./tr-data/labels.txt \
|
||||||
--model_name_or_path $BERT_MODEL \
|
--model_name_or_path $BERT_MODEL \
|
||||||
|
|||||||
10
tests/fixtures/tests_samples/conll/sample.json
vendored
Normal file
10
tests/fixtures/tests_samples/conll/sample.json
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{"words": ["He", "was", "the", "27th", "pitcher", "used", "by", "the", "Angels", "this", "season", ",", "tying", "a", "major-league", "record", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O", "O", "O", "O", "O", "O", "O"]}
|
||||||
|
{"words": ["CHICAGO", "AT", "ATLANTA"], "ner": ["B-ORG", "O", "B-LOC"]}
|
||||||
|
{"words": ["President", "Bill", "Clinton", "earlier", "this", "month", "invoked", "special", "powers", "to", "appoint", "Fowler", "during", "the", "congressional", "recess", "because", "the", "Senate", "delayed", "confirming", "his", "nomination", "."], "ner": ["O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O", "O", "O", "O"]}
|
||||||
|
{"words": ["goals", "for", ",", "goals", "against", ",", "points", ")", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O"]}
|
||||||
|
{"words": ["\"", "It", "is", "one", "step", "short", "of", "an", "emergency", "situation", ",", "\"", "a", "police", "spokesman", "said", "via", "telephone", "from", "a", "command", "post", "in", "the", "bush", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
|
||||||
|
{"words": ["U.S.", "Ambassador", "Myles", "Frechette", "applauded", "the", "move", ",", "saying", "it", "could", "prompt", "the", "Clinton", "administration", "to", "remove", "Colombia", "from", "a", "list", "of", "outcast", "nations", "that", "have", "failed", "to", "cooperate", "in", "U.S.", "counternarcotics", "efforts", "."], "ner": ["B-LOC", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O"]}
|
||||||
|
{"words": ["Halftime"], "ner": ["O"]}
|
||||||
|
{"words": ["It", "has", "manufacturing", "plants", "in", "San", "Diego", ";", "Creedmoor", ",", "N.C.", ";", "Hampshire", ",", "England", ";", "and", "Tijuana", ",", "Mexico", ",", "and", "distributes", "its", "prodcuts", "in", "more", "than", "120", "countries", "."], "ner": ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
|
||||||
|
{"words": ["Scotland", "manager", "Craig", "Brown", "said", "on", "Thursday", ":", "\"", "I", "'ve", "watched", "Duncan", "Ferguson", "in", "action", "twice", "recently", "and", "he", "'s", "bang", "in", "form", "."], "ner": ["B-LOC", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
|
||||||
|
{"words": ["Clinton", "flew", "in", "by", "helicopter", "from", "Michigan", "City", ",", "Indiana", ",", "after", "ending", "a", "four-day", ",", "559-mile", "trip", "aboard", "a", "campaign", "train", "from", "Washington", "."], "ner": ["B-PER", "O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O"]}
|
||||||
Reference in New Issue
Block a user