Reorganize examples (#9010)
* Reorganize example folder * Continue reorganization * Change requirements for tests * Final cleanup * Finish regroup with tests all passing * Copyright * Requirements and readme * Make a full link for the documentation * Address review comments * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Add symlink * Reorg again * Apply suggestions from code review Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com> * Adapt title * Update to new strucutre * Remove test * Update READMEs Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
This commit is contained in:
21
examples/legacy/README.md
Normal file
21
examples/legacy/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!---
|
||||
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Legacy examples
|
||||
|
||||
This folder contains examples which are not actively maintained (mostly contributed by the community).
|
||||
|
||||
Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.
|
||||
391
examples/legacy/pytorch-lightning/lightning_base.py
Normal file
391
examples/legacy/pytorch-lightning/lightning_base.py
Normal file
@@ -0,0 +1,391 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from pytorch_lightning.utilities import rank_zero_info
|
||||
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForPreTraining,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoModelForTokenClassification,
|
||||
AutoModelWithLMHead,
|
||||
AutoTokenizer,
|
||||
PretrainedConfig,
|
||||
PreTrainedTokenizer,
|
||||
)
|
||||
from transformers.optimization import (
|
||||
Adafactor,
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup,
|
||||
get_polynomial_decay_schedule_with_warmup,
|
||||
)
|
||||
from transformers.utils.versions import require_version_examples
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
require_version_examples("pytorch_lightning>=1.0.4")
|
||||
|
||||
MODEL_MODES = {
|
||||
"base": AutoModel,
|
||||
"sequence-classification": AutoModelForSequenceClassification,
|
||||
"question-answering": AutoModelForQuestionAnswering,
|
||||
"pretraining": AutoModelForPreTraining,
|
||||
"token-classification": AutoModelForTokenClassification,
|
||||
"language-modeling": AutoModelWithLMHead,
|
||||
"summarization": AutoModelForSeq2SeqLM,
|
||||
"translation": AutoModelForSeq2SeqLM,
|
||||
}
|
||||
|
||||
|
||||
# update this and the import above to support new schedulers from transformers.optimization
|
||||
arg_to_scheduler = {
|
||||
"linear": get_linear_schedule_with_warmup,
|
||||
"cosine": get_cosine_schedule_with_warmup,
|
||||
"cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
"polynomial": get_polynomial_decay_schedule_with_warmup,
|
||||
# '': get_constant_schedule, # not supported for now
|
||||
# '': get_constant_schedule_with_warmup, # not supported for now
|
||||
}
|
||||
arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
|
||||
arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"
|
||||
|
||||
|
||||
class BaseTransformer(pl.LightningModule):
|
||||
def __init__(
|
||||
self,
|
||||
hparams: argparse.Namespace,
|
||||
num_labels=None,
|
||||
mode="base",
|
||||
config=None,
|
||||
tokenizer=None,
|
||||
model=None,
|
||||
**config_kwargs
|
||||
):
|
||||
"""Initialize a model, tokenizer and config."""
|
||||
super().__init__()
|
||||
# TODO: move to self.save_hyperparameters()
|
||||
# self.save_hyperparameters()
|
||||
# can also expand arguments into trainer signature for easier reading
|
||||
|
||||
self.save_hyperparameters(hparams)
|
||||
self.step_count = 0
|
||||
self.output_dir = Path(self.hparams.output_dir)
|
||||
cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
|
||||
if config is None:
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
|
||||
**({"num_labels": num_labels} if num_labels is not None else {}),
|
||||
cache_dir=cache_dir,
|
||||
**config_kwargs,
|
||||
)
|
||||
else:
|
||||
self.config: PretrainedConfig = config
|
||||
|
||||
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
|
||||
for p in extra_model_params:
|
||||
if getattr(self.hparams, p, None):
|
||||
assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
|
||||
setattr(self.config, p, getattr(self.hparams, p))
|
||||
|
||||
if tokenizer is None:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
|
||||
cache_dir=cache_dir,
|
||||
)
|
||||
else:
|
||||
self.tokenizer: PreTrainedTokenizer = tokenizer
|
||||
self.model_type = MODEL_MODES[mode]
|
||||
if model is None:
|
||||
self.model = self.model_type.from_pretrained(
|
||||
self.hparams.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
|
||||
config=self.config,
|
||||
cache_dir=cache_dir,
|
||||
)
|
||||
else:
|
||||
self.model = model
|
||||
|
||||
def load_hf_checkpoint(self, *args, **kwargs):
|
||||
self.model = self.model_type.from_pretrained(*args, **kwargs)
|
||||
|
||||
def get_lr_scheduler(self):
|
||||
get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
|
||||
scheduler = get_schedule_func(
|
||||
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps()
|
||||
)
|
||||
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
|
||||
return scheduler
|
||||
|
||||
def configure_optimizers(self):
|
||||
"""Prepare optimizer and schedule (linear warmup and decay)"""
|
||||
model = self.model
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": self.hparams.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
if self.hparams.adafactor:
|
||||
optimizer = Adafactor(
|
||||
optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
|
||||
)
|
||||
|
||||
else:
|
||||
optimizer = AdamW(
|
||||
optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
|
||||
)
|
||||
self.opt = optimizer
|
||||
|
||||
scheduler = self.get_lr_scheduler()
|
||||
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
def test_step(self, batch, batch_nb):
|
||||
return self.validation_step(batch, batch_nb)
|
||||
|
||||
def test_epoch_end(self, outputs):
|
||||
return self.validation_end(outputs)
|
||||
|
||||
def total_steps(self) -> int:
|
||||
"""The number of total training steps that will be run. Used for lr scheduler purposes."""
|
||||
num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores
|
||||
effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
|
||||
return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs
|
||||
|
||||
def setup(self, mode):
|
||||
if mode == "test":
|
||||
self.dataset_size = len(self.test_dataloader().dataset)
|
||||
else:
|
||||
self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
|
||||
self.dataset_size = len(self.train_dataloader().dataset)
|
||||
|
||||
def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False):
|
||||
raise NotImplementedError("You must implement this for your task")
|
||||
|
||||
def train_dataloader(self):
|
||||
return self.train_loader
|
||||
|
||||
def val_dataloader(self):
|
||||
return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)
|
||||
|
||||
def test_dataloader(self):
|
||||
return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)
|
||||
|
||||
def _feature_file(self, mode):
|
||||
return os.path.join(
|
||||
self.hparams.data_dir,
|
||||
"cached_{}_{}_{}".format(
|
||||
mode,
|
||||
list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
|
||||
str(self.hparams.max_seq_length),
|
||||
),
|
||||
)
|
||||
|
||||
@pl.utilities.rank_zero_only
|
||||
def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
|
||||
save_path = self.output_dir.joinpath("best_tfmr")
|
||||
self.model.config.save_step = self.step_count
|
||||
self.model.save_pretrained(save_path)
|
||||
self.tokenizer.save_pretrained(save_path)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parser, root_dir):
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from huggingface.co",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--encoder_layerdrop",
|
||||
type=float,
|
||||
help="Encoder layer dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decoder_layerdrop",
|
||||
type=float,
|
||||
help="Decoder layer dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dropout",
|
||||
type=float,
|
||||
help="Dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attention_dropout",
|
||||
type=float,
|
||||
help="Attention dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--lr_scheduler",
|
||||
default="linear",
|
||||
choices=arg_to_scheduler_choices,
|
||||
metavar=arg_to_scheduler_metavar,
|
||||
type=str,
|
||||
help="Learning rate scheduler",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
|
||||
parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
|
||||
parser.add_argument("--train_batch_size", default=32, type=int)
|
||||
parser.add_argument("--eval_batch_size", default=32, type=int)
|
||||
parser.add_argument("--adafactor", action="store_true")
|
||||
|
||||
|
||||
class LoggingCallback(pl.Callback):
|
||||
def on_batch_end(self, trainer, pl_module):
|
||||
lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
|
||||
lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_lr())}
|
||||
pl_module.logger.log_metrics(lrs)
|
||||
|
||||
def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
|
||||
rank_zero_info("***** Validation results *****")
|
||||
metrics = trainer.callback_metrics
|
||||
# Log results
|
||||
for key in sorted(metrics):
|
||||
if key not in ["log", "progress_bar"]:
|
||||
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
|
||||
|
||||
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
|
||||
rank_zero_info("***** Test results *****")
|
||||
metrics = trainer.callback_metrics
|
||||
# Log and save results to file
|
||||
output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
|
||||
with open(output_test_results_file, "w") as writer:
|
||||
for key in sorted(metrics):
|
||||
if key not in ["log", "progress_bar"]:
|
||||
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
|
||||
writer.write("{} = {}\n".format(key, str(metrics[key])))
|
||||
|
||||
|
||||
def add_generic_args(parser, root_dir) -> None:
|
||||
# To allow all pl args uncomment the following line
|
||||
# parser = pl.Trainer.add_argparse_args(parser)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O2",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int)
|
||||
parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
dest="accumulate_grad_batches",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
|
||||
)
|
||||
|
||||
|
||||
def generic_train(
|
||||
model: BaseTransformer,
|
||||
args: argparse.Namespace,
|
||||
early_stopping_callback=None,
|
||||
logger=True, # can pass WandbLogger() here
|
||||
extra_callbacks=[],
|
||||
checkpoint_callback=None,
|
||||
logging_callback=None,
|
||||
**extra_train_kwargs
|
||||
):
|
||||
pl.seed_everything(args.seed)
|
||||
|
||||
# init model
|
||||
odir = Path(model.hparams.output_dir)
|
||||
odir.mkdir(exist_ok=True)
|
||||
|
||||
# add custom checkpoints
|
||||
if checkpoint_callback is None:
|
||||
checkpoint_callback = pl.callbacks.ModelCheckpoint(
|
||||
filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
|
||||
)
|
||||
if early_stopping_callback:
|
||||
extra_callbacks.append(early_stopping_callback)
|
||||
if logging_callback is None:
|
||||
logging_callback = LoggingCallback()
|
||||
|
||||
train_params = {}
|
||||
|
||||
# TODO: remove with PyTorch 1.6 since pl uses native amp
|
||||
if args.fp16:
|
||||
train_params["precision"] = 16
|
||||
train_params["amp_level"] = args.fp16_opt_level
|
||||
|
||||
if args.gpus > 1:
|
||||
train_params["distributed_backend"] = "ddp"
|
||||
|
||||
train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
|
||||
train_params["accelerator"] = extra_train_kwargs.get("accelerator", None)
|
||||
train_params["profiler"] = extra_train_kwargs.get("profiler", None)
|
||||
|
||||
trainer = pl.Trainer.from_argparse_args(
|
||||
args,
|
||||
weights_summary=None,
|
||||
callbacks=[logging_callback] + extra_callbacks,
|
||||
logger=logger,
|
||||
checkpoint_callback=checkpoint_callback,
|
||||
**train_params,
|
||||
)
|
||||
|
||||
if args.do_train:
|
||||
trainer.fit(model)
|
||||
|
||||
return trainer
|
||||
21
examples/legacy/pytorch-lightning/requirements.txt
Normal file
21
examples/legacy/pytorch-lightning/requirements.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
tensorboard
|
||||
scikit-learn
|
||||
seqeval
|
||||
psutil
|
||||
sacrebleu
|
||||
rouge-score
|
||||
tensorflow_datasets
|
||||
pytorch-lightning==1.0.4
|
||||
matplotlib
|
||||
git-python==1.0.3
|
||||
faiss-cpu
|
||||
streamlit
|
||||
elasticsearch
|
||||
nltk
|
||||
pandas
|
||||
datasets >= 1.1.3
|
||||
fire
|
||||
pytest
|
||||
conllu
|
||||
sentencepiece != 0.1.92
|
||||
protobuf
|
||||
201
examples/legacy/pytorch-lightning/run_glue.py
Normal file
201
examples/legacy/pytorch-lightning/run_glue.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from argparse import Namespace
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, TensorDataset
|
||||
|
||||
from lightning_base import BaseTransformer, add_generic_args, generic_train
|
||||
from transformers import glue_compute_metrics as compute_metrics
|
||||
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||
from transformers import glue_output_modes
|
||||
from transformers import glue_processors as processors
|
||||
from transformers import glue_tasks_num_labels
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GLUETransformer(BaseTransformer):
|
||||
|
||||
mode = "sequence-classification"
|
||||
|
||||
def __init__(self, hparams):
|
||||
if type(hparams) == dict:
|
||||
hparams = Namespace(**hparams)
|
||||
hparams.glue_output_mode = glue_output_modes[hparams.task]
|
||||
num_labels = glue_tasks_num_labels[hparams.task]
|
||||
|
||||
super().__init__(hparams, num_labels, self.mode)
|
||||
|
||||
def forward(self, **inputs):
|
||||
return self.model(**inputs)
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
|
||||
if self.config.model_type not in ["distilbert", "bart"]:
|
||||
inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
|
||||
|
||||
outputs = self(**inputs)
|
||||
loss = outputs[0]
|
||||
|
||||
lr_scheduler = self.trainer.lr_schedulers[0]["scheduler"]
|
||||
tensorboard_logs = {"loss": loss, "rate": lr_scheduler.get_last_lr()[-1]}
|
||||
return {"loss": loss, "log": tensorboard_logs}
|
||||
|
||||
def prepare_data(self):
|
||||
"Called to initialize data. Use the call to construct features"
|
||||
args = self.hparams
|
||||
processor = processors[args.task]()
|
||||
self.labels = processor.get_labels()
|
||||
|
||||
for mode in ["train", "dev"]:
|
||||
cached_features_file = self._feature_file(mode)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
examples = (
|
||||
processor.get_dev_examples(args.data_dir)
|
||||
if mode == "dev"
|
||||
else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples,
|
||||
self.tokenizer,
|
||||
max_length=args.max_seq_length,
|
||||
label_list=self.labels,
|
||||
output_mode=args.glue_output_mode,
|
||||
)
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
def get_dataloader(self, mode: str, batch_size: int, shuffle: bool = False) -> DataLoader:
|
||||
"Load datasets. Called after prepare data."
|
||||
|
||||
# We test on dev set to compare to benchmarks without having to submit to GLUE server
|
||||
mode = "dev" if mode == "test" else mode
|
||||
|
||||
cached_features_file = self._feature_file(mode)
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||
if self.hparams.glue_output_mode == "classification":
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
elif self.hparams.glue_output_mode == "regression":
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||
|
||||
return DataLoader(
|
||||
TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
|
||||
batch_size=batch_size,
|
||||
shuffle=shuffle,
|
||||
)
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
|
||||
if self.config.model_type not in ["distilbert", "bart"]:
|
||||
inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
|
||||
|
||||
outputs = self(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
|
||||
return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
|
||||
|
||||
def _eval_end(self, outputs) -> tuple:
|
||||
val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean().detach().cpu().item()
|
||||
preds = np.concatenate([x["pred"] for x in outputs], axis=0)
|
||||
|
||||
if self.hparams.glue_output_mode == "classification":
|
||||
preds = np.argmax(preds, axis=1)
|
||||
elif self.hparams.glue_output_mode == "regression":
|
||||
preds = np.squeeze(preds)
|
||||
|
||||
out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
|
||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
|
||||
results = {**{"val_loss": val_loss_mean}, **compute_metrics(self.hparams.task, preds, out_label_ids)}
|
||||
|
||||
ret = {k: v for k, v in results.items()}
|
||||
ret["log"] = results
|
||||
return ret, preds_list, out_label_list
|
||||
|
||||
def validation_epoch_end(self, outputs: list) -> dict:
|
||||
ret, preds, targets = self._eval_end(outputs)
|
||||
logs = ret["log"]
|
||||
return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
def test_epoch_end(self, outputs) -> dict:
|
||||
ret, predictions, targets = self._eval_end(outputs)
|
||||
logs = ret["log"]
|
||||
# `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
|
||||
return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parser, root_dir):
|
||||
BaseTransformer.add_model_specific_args(parser, root_dir)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--task",
|
||||
default="",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The GLUE task to run",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gpus",
|
||||
default=0,
|
||||
type=int,
|
||||
help="The number of GPUs allocated for this, it is by default 0 meaning none",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
add_generic_args(parser, os.getcwd())
|
||||
parser = GLUETransformer.add_model_specific_args(parser, os.getcwd())
|
||||
args = parser.parse_args()
|
||||
|
||||
# If output_dir not provided, a folder will be generated in pwd
|
||||
if args.output_dir is None:
|
||||
args.output_dir = os.path.join(
|
||||
"./results",
|
||||
f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
|
||||
)
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
model = GLUETransformer(args)
|
||||
trainer = generic_train(model, args)
|
||||
|
||||
# Optionally, predict on dev set and write to output_dir
|
||||
if args.do_predict:
|
||||
checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True)))
|
||||
model = model.load_from_checkpoint(checkpoints[-1])
|
||||
return trainer.test(model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
34
examples/legacy/pytorch-lightning/run_glue.sh
Executable file
34
examples/legacy/pytorch-lightning/run_glue.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
# Install example requirements
|
||||
pip install -r ../requirements.txt
|
||||
|
||||
# Download glue data
|
||||
python3 ../../utils/download_glue_data.py
|
||||
|
||||
export TASK=mrpc
|
||||
export DATA_DIR=./glue_data/MRPC/
|
||||
export MAX_LENGTH=128
|
||||
export LEARNING_RATE=2e-5
|
||||
export BERT_MODEL=bert-base-cased
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SEED=2
|
||||
export OUTPUT_DIR_NAME=mrpc-pl-bert
|
||||
export CURRENT_DIR=${PWD}
|
||||
export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
|
||||
|
||||
# Make output directory if it doesn't exist
|
||||
mkdir -p $OUTPUT_DIR
|
||||
# Add parent directory to python path to access lightning_base.py
|
||||
export PYTHONPATH="../":"${PYTHONPATH}"
|
||||
|
||||
python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \
|
||||
--task $TASK \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--learning_rate $LEARNING_RATE \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--train_batch_size $BATCH_SIZE \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_predict
|
||||
215
examples/legacy/pytorch-lightning/run_ner.py
Normal file
215
examples/legacy/pytorch-lightning/run_ner.py
Normal file
@@ -0,0 +1,215 @@
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from importlib import import_module
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from torch.utils.data import DataLoader, TensorDataset
|
||||
|
||||
from lightning_base import BaseTransformer, add_generic_args, generic_train
|
||||
from utils_ner import TokenClassificationTask
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NERTransformer(BaseTransformer):
|
||||
"""
|
||||
A training module for NER. See BaseTransformer for the core options.
|
||||
"""
|
||||
|
||||
mode = "token-classification"
|
||||
|
||||
def __init__(self, hparams):
|
||||
if type(hparams) == dict:
|
||||
hparams = Namespace(**hparams)
|
||||
module = import_module("tasks")
|
||||
try:
|
||||
token_classification_task_clazz = getattr(module, hparams.task_type)
|
||||
self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
|
||||
except AttributeError:
|
||||
raise ValueError(
|
||||
f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
|
||||
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
|
||||
)
|
||||
self.labels = self.token_classification_task.get_labels(hparams.labels)
|
||||
self.pad_token_label_id = CrossEntropyLoss().ignore_index
|
||||
super().__init__(hparams, len(self.labels), self.mode)
|
||||
|
||||
def forward(self, **inputs):
|
||||
return self.model(**inputs)
|
||||
|
||||
def training_step(self, batch, batch_num):
|
||||
"Compute loss and log."
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if self.config.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if self.config.model_type in ["bert", "xlnet"] else None
|
||||
) # XLM and RoBERTa don"t use token_type_ids
|
||||
|
||||
outputs = self(**inputs)
|
||||
loss = outputs[0]
|
||||
# tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
|
||||
return {"loss": loss}
|
||||
|
||||
def prepare_data(self):
|
||||
"Called to initialize data. Use the call to construct features"
|
||||
args = self.hparams
|
||||
for mode in ["train", "dev", "test"]:
|
||||
cached_features_file = self._feature_file(mode)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
|
||||
features = self.token_classification_task.convert_examples_to_features(
|
||||
examples,
|
||||
self.labels,
|
||||
args.max_seq_length,
|
||||
self.tokenizer,
|
||||
cls_token_at_end=bool(self.config.model_type in ["xlnet"]),
|
||||
cls_token=self.tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
|
||||
sep_token=self.tokenizer.sep_token,
|
||||
sep_token_extra=False,
|
||||
pad_on_left=bool(self.config.model_type in ["xlnet"]),
|
||||
pad_token=self.tokenizer.pad_token_id,
|
||||
pad_token_segment_id=self.tokenizer.pad_token_type_id,
|
||||
pad_token_label_id=self.pad_token_label_id,
|
||||
)
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
def get_dataloader(self, mode: int, batch_size: int, shuffle: bool = False) -> DataLoader:
|
||||
"Load datasets. Called after prepare data."
|
||||
cached_features_file = self._feature_file(mode)
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||
if features[0].token_type_ids is not None:
|
||||
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||
else:
|
||||
all_token_type_ids = torch.tensor([0 for f in features], dtype=torch.long)
|
||||
# HACK(we will not use this anymore soon)
|
||||
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
|
||||
return DataLoader(
|
||||
TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids), batch_size=batch_size
|
||||
)
|
||||
|
||||
def validation_step(self, batch, batch_nb):
|
||||
"""Compute validation""" ""
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if self.config.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if self.config.model_type in ["bert", "xlnet"] else None
|
||||
) # XLM and RoBERTa don"t use token_type_ids
|
||||
outputs = self(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
|
||||
|
||||
def _eval_end(self, outputs):
|
||||
"Evaluation called for both Val and Test"
|
||||
val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
|
||||
preds = np.concatenate([x["pred"] for x in outputs], axis=0)
|
||||
preds = np.argmax(preds, axis=2)
|
||||
out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
|
||||
|
||||
label_map = {i: label for i, label in enumerate(self.labels)}
|
||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
|
||||
for i in range(out_label_ids.shape[0]):
|
||||
for j in range(out_label_ids.shape[1]):
|
||||
if out_label_ids[i, j] != self.pad_token_label_id:
|
||||
out_label_list[i].append(label_map[out_label_ids[i][j]])
|
||||
preds_list[i].append(label_map[preds[i][j]])
|
||||
|
||||
results = {
|
||||
"val_loss": val_loss_mean,
|
||||
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||
"precision": precision_score(out_label_list, preds_list),
|
||||
"recall": recall_score(out_label_list, preds_list),
|
||||
"f1": f1_score(out_label_list, preds_list),
|
||||
}
|
||||
|
||||
ret = {k: v for k, v in results.items()}
|
||||
ret["log"] = results
|
||||
return ret, preds_list, out_label_list
|
||||
|
||||
def validation_epoch_end(self, outputs):
|
||||
# when stable
|
||||
ret, preds, targets = self._eval_end(outputs)
|
||||
logs = ret["log"]
|
||||
return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
def test_epoch_end(self, outputs):
|
||||
# updating to test_epoch_end instead of deprecated test_end
|
||||
ret, predictions, targets = self._eval_end(outputs)
|
||||
|
||||
# Converting to the dict required by pl
|
||||
# https://github.com/PyTorchLightning/pytorch-lightning/blob/master/\
|
||||
# pytorch_lightning/trainer/logging.py#L139
|
||||
logs = ret["log"]
|
||||
# `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
|
||||
return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parser, root_dir):
|
||||
# Add NER specific options
|
||||
BaseTransformer.add_model_specific_args(parser, root_dir)
|
||||
parser.add_argument(
|
||||
"--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--labels",
|
||||
default="",
|
||||
type=str,
|
||||
help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gpus",
|
||||
default=0,
|
||||
type=int,
|
||||
help="The number of GPUs allocated for this, it is by default 0 meaning none",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
add_generic_args(parser, os.getcwd())
|
||||
parser = NERTransformer.add_model_specific_args(parser, os.getcwd())
|
||||
args = parser.parse_args()
|
||||
model = NERTransformer(args)
|
||||
trainer = generic_train(model, args)
|
||||
|
||||
if args.do_predict:
|
||||
# See https://github.com/huggingface/transformers/issues/3159
|
||||
# pl use this default format to create a checkpoint:
|
||||
# https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
|
||||
# /pytorch_lightning/callbacks/model_checkpoint.py#L322
|
||||
checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True)))
|
||||
model = model.load_from_checkpoint(checkpoints[-1])
|
||||
trainer.test(model)
|
||||
44
examples/legacy/pytorch-lightning/run_ner.sh
Executable file
44
examples/legacy/pytorch-lightning/run_ner.sh
Executable file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# for seqeval metrics import
|
||||
pip install -r ../requirements.txt
|
||||
|
||||
## The relevant files are currently on a shared Google
|
||||
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
|
||||
## Monitor for changes and eventually migrate to nlp dataset
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
||||
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=bert-base-multilingual-cased
|
||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SEED=1
|
||||
|
||||
export OUTPUT_DIR_NAME=germeval-model
|
||||
export CURRENT_DIR=${PWD}
|
||||
export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
|
||||
mkdir -p $OUTPUT_DIR
|
||||
|
||||
# Add parent directory to python path to access lightning_base.py
|
||||
export PYTHONPATH="../":"${PYTHONPATH}"
|
||||
|
||||
python3 run_ner.py --data_dir ./ \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--train_batch_size $BATCH_SIZE \
|
||||
--seed $SEED \
|
||||
--gpus 1 \
|
||||
--do_train \
|
||||
--do_predict
|
||||
39
examples/legacy/pytorch-lightning/run_pos.sh
Executable file
39
examples/legacy/pytorch-lightning/run_pos.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
|
||||
if ! [ -f ./dev.txt ]; then
|
||||
echo "Download dev dataset...."
|
||||
curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
|
||||
fi
|
||||
|
||||
if ! [ -f ./test.txt ]; then
|
||||
echo "Download test dataset...."
|
||||
curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
|
||||
fi
|
||||
|
||||
if ! [ -f ./train.txt ]; then
|
||||
echo "Download train dataset...."
|
||||
curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
|
||||
fi
|
||||
|
||||
export MAX_LENGTH=200
|
||||
export BERT_MODEL=bert-base-uncased
|
||||
export OUTPUT_DIR=postagger-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
|
||||
# Add parent directory to python path to access lightning_base.py
|
||||
export PYTHONPATH="../":"${PYTHONPATH}"
|
||||
|
||||
python3 run_ner.py --data_dir ./ \
|
||||
--task_type POS \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--train_batch_size $BATCH_SIZE \
|
||||
--seed $SEED \
|
||||
--gpus 1 \
|
||||
--do_train \
|
||||
--do_predict
|
||||
830
examples/legacy/question-answering/run_squad.py
Normal file
830
examples/legacy/question-answering/run_squad.py
Normal file
@@ -0,0 +1,830 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
||||
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import timeit
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
squad_convert_examples_to_features,
|
||||
)
|
||||
from transformers.data.metrics.squad_metrics import (
|
||||
compute_predictions_log_probs,
|
||||
compute_predictions_logits,
|
||||
squad_evaluate,
|
||||
)
|
||||
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
if args.n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def to_list(tensor):
|
||||
return tensor.detach().cpu().tolist()
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
""" Train the model """
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer = SummaryWriter()
|
||||
|
||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
|
||||
os.path.join(args.model_name_or_path, "scheduler.pt")
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
if args.n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 1
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
try:
|
||||
# set global_step to gobal_step of last saved checkpoint from model path
|
||||
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
|
||||
global_step = int(checkpoint_suffix)
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
except ValueError:
|
||||
logger.info(" Starting fine-tuning.")
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
# Added here for reproductibility
|
||||
set_seed(args)
|
||||
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2],
|
||||
"start_positions": batch[3],
|
||||
"end_positions": batch[4],
|
||||
}
|
||||
|
||||
if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
|
||||
del inputs["token_type_ids"]
|
||||
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
|
||||
if args.version_2_with_negative:
|
||||
inputs.update({"is_impossible": batch[7]})
|
||||
if hasattr(model, "config") and hasattr(model.config, "lang2id"):
|
||||
inputs.update(
|
||||
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
|
||||
)
|
||||
|
||||
outputs = model(**inputs)
|
||||
# model outputs are always tuple in transformers (see doc)
|
||||
loss = outputs[0]
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
if args.fp16:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
tr_loss += loss.item()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||
else:
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
||||
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
model.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Log metrics
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Only evaluate when single GPU otherwise metrics may not average well
|
||||
if args.local_rank == -1 and args.evaluate_during_training:
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
# Save model checkpoint
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
train_iterator.close()
|
||||
break
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer.close()
|
||||
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
|
||||
|
||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||
|
||||
# Note that DistributedSampler samples randomly
|
||||
eval_sampler = SequentialSampler(dataset)
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# multi-gpu evaluate
|
||||
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
all_results = []
|
||||
start_time = timeit.default_timer()
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
model.eval()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2],
|
||||
}
|
||||
|
||||
if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
|
||||
del inputs["token_type_ids"]
|
||||
|
||||
feature_indices = batch[3]
|
||||
|
||||
# XLNet and XLM use more arguments for their predictions
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
|
||||
# for lang_id-sensitive xlm models
|
||||
if hasattr(model, "config") and hasattr(model.config, "lang2id"):
|
||||
inputs.update(
|
||||
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
|
||||
)
|
||||
outputs = model(**inputs)
|
||||
|
||||
for i, feature_index in enumerate(feature_indices):
|
||||
eval_feature = features[feature_index.item()]
|
||||
unique_id = int(eval_feature.unique_id)
|
||||
|
||||
output = [to_list(output[i]) for output in outputs.to_tuple()]
|
||||
|
||||
# Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
|
||||
# models only use two.
|
||||
if len(output) >= 5:
|
||||
start_logits = output[0]
|
||||
start_top_index = output[1]
|
||||
end_logits = output[2]
|
||||
end_top_index = output[3]
|
||||
cls_logits = output[4]
|
||||
|
||||
result = SquadResult(
|
||||
unique_id,
|
||||
start_logits,
|
||||
end_logits,
|
||||
start_top_index=start_top_index,
|
||||
end_top_index=end_top_index,
|
||||
cls_logits=cls_logits,
|
||||
)
|
||||
|
||||
else:
|
||||
start_logits, end_logits = output
|
||||
result = SquadResult(unique_id, start_logits, end_logits)
|
||||
|
||||
all_results.append(result)
|
||||
|
||||
evalTime = timeit.default_timer() - start_time
|
||||
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
|
||||
|
||||
# Compute predictions
|
||||
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
|
||||
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
|
||||
|
||||
if args.version_2_with_negative:
|
||||
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
|
||||
else:
|
||||
output_null_log_odds_file = None
|
||||
|
||||
# XLNet and XLM use a more complex post-processing procedure
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
|
||||
end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
|
||||
|
||||
predictions = compute_predictions_log_probs(
|
||||
examples,
|
||||
features,
|
||||
all_results,
|
||||
args.n_best_size,
|
||||
args.max_answer_length,
|
||||
output_prediction_file,
|
||||
output_nbest_file,
|
||||
output_null_log_odds_file,
|
||||
start_n_top,
|
||||
end_n_top,
|
||||
args.version_2_with_negative,
|
||||
tokenizer,
|
||||
args.verbose_logging,
|
||||
)
|
||||
else:
|
||||
predictions = compute_predictions_logits(
|
||||
examples,
|
||||
features,
|
||||
all_results,
|
||||
args.n_best_size,
|
||||
args.max_answer_length,
|
||||
args.do_lower_case,
|
||||
output_prediction_file,
|
||||
output_nbest_file,
|
||||
output_null_log_odds_file,
|
||||
args.verbose_logging,
|
||||
args.version_2_with_negative,
|
||||
args.null_score_diff_threshold,
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
# Compute the F1 and exact scores.
|
||||
results = squad_evaluate(examples, predictions)
|
||||
return results
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0] and not evaluate:
|
||||
# Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
torch.distributed.barrier()
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_dir = args.data_dir if args.data_dir else "."
|
||||
cached_features_file = os.path.join(
|
||||
input_dir,
|
||||
"cached_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
),
|
||||
)
|
||||
|
||||
# Init features and dataset from cache if it exists
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features_and_dataset = torch.load(cached_features_file)
|
||||
features, dataset, examples = (
|
||||
features_and_dataset["features"],
|
||||
features_and_dataset["dataset"],
|
||||
features_and_dataset["examples"],
|
||||
)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_dir)
|
||||
|
||||
if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
|
||||
try:
|
||||
import tensorflow_datasets as tfds
|
||||
except ImportError:
|
||||
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
|
||||
|
||||
if args.version_2_with_negative:
|
||||
logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
|
||||
|
||||
tfds_examples = tfds.load("squad")
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
else:
|
||||
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
||||
if evaluate:
|
||||
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
|
||||
else:
|
||||
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
|
||||
|
||||
features, dataset = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=not evaluate,
|
||||
return_dataset="pt",
|
||||
threads=args.threads,
|
||||
)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
|
||||
|
||||
if args.local_rank == 0 and not evaluate:
|
||||
# Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
torch.distributed.barrier()
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
return dataset
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input data dir. Should contain the .json files for the task."
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input training file. If a data dir is specified, will look for the file there"
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--predict_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input evaluation file. If a data dir is specified, will look for the file there"
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from huggingface.co",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--version_2_with_negative",
|
||||
action="store_true",
|
||||
help="If true, the SQuAD examples contain some that do not have an answer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--null_score_diff_threshold",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="If null_score - best_non_null is greater than the threshold predict null.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--doc_stride",
|
||||
default=128,
|
||||
type=int,
|
||||
help="When splitting up a long document into chunks, how much stride to take between chunks.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_query_length",
|
||||
default=64,
|
||||
type=int,
|
||||
help="The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument(
|
||||
"--n_best_size",
|
||||
default=20,
|
||||
type=int,
|
||||
help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_answer_length",
|
||||
default=30,
|
||||
type=int,
|
||||
help="The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose_logging",
|
||||
action="store_true",
|
||||
help="If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lang_id",
|
||||
default=0,
|
||||
type=int,
|
||||
help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
|
||||
)
|
||||
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
|
||||
parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.doc_stride >= args.max_seq_length - args.max_query_length:
|
||||
logger.warning(
|
||||
"WARNING - You've set a doc stride which may be superior to the document length in some "
|
||||
"examples. This could result in errors when building features from the examples. Please reduce the doc "
|
||||
"stride or increase the maximum length to ensure the features are correctly built."
|
||||
)
|
||||
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
# Setup CUDA, GPU & distributed training
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
if args.local_rank not in [-1, 0]:
|
||||
# Make sure only the first process in distributed training will download model & vocab
|
||||
torch.distributed.barrier()
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config = AutoConfig.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
|
||||
)
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
# Make sure only the first process in distributed training will download model & vocab
|
||||
torch.distributed.barrier()
|
||||
|
||||
model.to(args.device)
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
# Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
|
||||
# Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
|
||||
# remove the need for this code, but it is still valid.
|
||||
if args.fp16:
|
||||
try:
|
||||
import apex
|
||||
|
||||
apex.amp.register_half_function(torch, "einsum")
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
|
||||
# Training
|
||||
if args.do_train:
|
||||
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True)
|
||||
|
||||
# SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
|
||||
# So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
if args.do_train:
|
||||
logger.info("Loading checkpoints saved during training for evaluation")
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(
|
||||
os.path.dirname(c)
|
||||
for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
|
||||
else:
|
||||
logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
|
||||
checkpoints = [args.model_name_or_path]
|
||||
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
# Reload the model
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint) # , force_download=True)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
logger.info("Results: {}".format(results))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
174
examples/legacy/question-answering/run_squad_trainer.py
Normal file
174
examples/legacy/question-answering/run_squad_trainer.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Fine-tuning the library models for question-answering."""
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import transformers
|
||||
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer, HfArgumentParser, SquadDataset
|
||||
from transformers import SquadDataTrainingArguments as DataTrainingArguments
|
||||
from transformers import Trainer, TrainingArguments
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
|
||||
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
|
||||
# or just modify its tokenizer_config.json.
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
if (
|
||||
os.path.exists(training_args.output_dir)
|
||||
and os.listdir(training_args.output_dir)
|
||||
and training_args.do_train
|
||||
and not training_args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
||||
)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_rank,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.local_rank != -1),
|
||||
training_args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Prepare Question-Answering task
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
|
||||
)
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
|
||||
# Get datasets
|
||||
is_language_sensitive = hasattr(model.config, "lang2id")
|
||||
train_dataset = (
|
||||
SquadDataset(
|
||||
data_args, tokenizer=tokenizer, is_language_sensitive=is_language_sensitive, cache_dir=model_args.cache_dir
|
||||
)
|
||||
if training_args.do_train
|
||||
else None
|
||||
)
|
||||
eval_dataset = (
|
||||
SquadDataset(
|
||||
data_args,
|
||||
tokenizer=tokenizer,
|
||||
mode="dev",
|
||||
is_language_sensitive=is_language_sensitive,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
trainer.train(
|
||||
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
||||
)
|
||||
trainer.save_model()
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
if trainer.is_world_master():
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
46
examples/legacy/run_camembert.py
Normal file
46
examples/legacy/run_camembert.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import torch
|
||||
|
||||
from transformers import CamembertForMaskedLM, CamembertTokenizer
|
||||
|
||||
|
||||
def fill_mask(masked_input, model, tokenizer, topk=5):
|
||||
# Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
|
||||
assert masked_input.count("<mask>") == 1
|
||||
input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple
|
||||
masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
|
||||
logits = logits[0, masked_index, :]
|
||||
prob = logits.softmax(dim=0)
|
||||
values, indices = prob.topk(k=topk, dim=0)
|
||||
topk_predicted_token_bpe = " ".join(
|
||||
[tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
|
||||
)
|
||||
masked_token = tokenizer.mask_token
|
||||
topk_filled_outputs = []
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
|
||||
predicted_token = predicted_token_bpe.replace("\u2581", " ")
|
||||
if " {0}".format(masked_token) in masked_input:
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(" {0}".format(masked_token), predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
)
|
||||
else:
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(masked_token, predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
)
|
||||
return topk_filled_outputs
|
||||
|
||||
|
||||
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
|
||||
model = CamembertForMaskedLM.from_pretrained("camembert-base")
|
||||
model.eval()
|
||||
|
||||
masked_input = "Le camembert est <mask> :)"
|
||||
print(fill_mask(masked_input, model, tokenizer, topk=3))
|
||||
147
examples/legacy/run_chinese_ref.py
Normal file
147
examples/legacy/run_chinese_ref.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import argparse
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
from ltp import LTP
|
||||
from transformers import BertTokenizer
|
||||
|
||||
|
||||
def _is_chinese_char(cp):
|
||||
"""Checks whether CP is the codepoint of a CJK character."""
|
||||
# This defines a "chinese character" as anything in the CJK Unicode block:
|
||||
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
||||
#
|
||||
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
||||
# despite its name. The modern Korean Hangul alphabet is a different block,
|
||||
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
||||
# space-separated words, so they are not treated specially and handled
|
||||
# like the all of the other languages.
|
||||
if (
|
||||
(cp >= 0x4E00 and cp <= 0x9FFF)
|
||||
or (cp >= 0x3400 and cp <= 0x4DBF) #
|
||||
or (cp >= 0x20000 and cp <= 0x2A6DF) #
|
||||
or (cp >= 0x2A700 and cp <= 0x2B73F) #
|
||||
or (cp >= 0x2B740 and cp <= 0x2B81F) #
|
||||
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
|
||||
or (cp >= 0xF900 and cp <= 0xFAFF)
|
||||
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
|
||||
): #
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def is_chinese(word: str):
|
||||
# word like '180' or '身高' or '神'
|
||||
for char in word:
|
||||
char = ord(char)
|
||||
if not _is_chinese_char(char):
|
||||
return 0
|
||||
return 1
|
||||
|
||||
|
||||
def get_chinese_word(tokens: List[str]):
|
||||
word_set = set()
|
||||
|
||||
for token in tokens:
|
||||
chinese_word = len(token) > 1 and is_chinese(token)
|
||||
if chinese_word:
|
||||
word_set.add(token)
|
||||
word_list = list(word_set)
|
||||
return word_list
|
||||
|
||||
|
||||
def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
|
||||
if not chinese_word_set:
|
||||
return bert_tokens
|
||||
max_word_len = max([len(w) for w in chinese_word_set])
|
||||
|
||||
bert_word = bert_tokens
|
||||
start, end = 0, len(bert_word)
|
||||
while start < end:
|
||||
single_word = True
|
||||
if is_chinese(bert_word[start]):
|
||||
l = min(end - start, max_word_len)
|
||||
for i in range(l, 1, -1):
|
||||
whole_word = "".join(bert_word[start : start + i])
|
||||
if whole_word in chinese_word_set:
|
||||
for j in range(start + 1, start + i):
|
||||
bert_word[j] = "##" + bert_word[j]
|
||||
start = start + i
|
||||
single_word = False
|
||||
break
|
||||
if single_word:
|
||||
start += 1
|
||||
return bert_word
|
||||
|
||||
|
||||
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
|
||||
ltp_res = []
|
||||
|
||||
for i in range(0, len(lines), 100):
|
||||
res = ltp_tokenizer.seg(lines[i : i + 100])[0]
|
||||
res = [get_chinese_word(r) for r in res]
|
||||
ltp_res.extend(res)
|
||||
assert len(ltp_res) == len(lines)
|
||||
|
||||
bert_res = []
|
||||
for i in range(0, len(lines), 100):
|
||||
res = bert_tokenizer(lines[i : i + 100], add_special_tokens=True, truncation=True, max_length=512)
|
||||
bert_res.extend(res["input_ids"])
|
||||
assert len(bert_res) == len(lines)
|
||||
|
||||
ref_ids = []
|
||||
for input_ids, chinese_word in zip(bert_res, ltp_res):
|
||||
|
||||
input_tokens = []
|
||||
for id in input_ids:
|
||||
token = bert_tokenizer._convert_id_to_token(id)
|
||||
input_tokens.append(token)
|
||||
input_tokens = add_sub_symbol(input_tokens, chinese_word)
|
||||
ref_id = []
|
||||
# We only save pos of chinese subwords start with ##, which mean is part of a whole word.
|
||||
for i, token in enumerate(input_tokens):
|
||||
if token[:2] == "##":
|
||||
clean_token = token[2:]
|
||||
# save chinese tokens' pos
|
||||
if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
|
||||
ref_id.append(i)
|
||||
ref_ids.append(ref_id)
|
||||
|
||||
assert len(ref_ids) == len(bert_res)
|
||||
|
||||
return ref_ids
|
||||
|
||||
|
||||
def main(args):
|
||||
# For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
# If we want to fine-tune these model, we have to use same tokenizer : LTP (https://github.com/HIT-SCIR/ltp)
|
||||
with open(args.file_name, "r", encoding="utf-8") as f:
|
||||
data = f.readlines()
|
||||
data = [line.strip() for line in data if len(line) > 0 and not line.isspace()] # avoid delimiter like '\u2029'
|
||||
ltp_tokenizer = LTP(args.ltp) # faster in GPU device
|
||||
bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
|
||||
|
||||
ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)
|
||||
|
||||
with open(args.save_path, "w", encoding="utf-8") as f:
|
||||
data = [json.dumps(ref) + "\n" for ref in ref_ids]
|
||||
f.writelines(data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="prepare_chinese_ref")
|
||||
parser.add_argument(
|
||||
"--file_name",
|
||||
type=str,
|
||||
default="./resources/chinese-demo.txt",
|
||||
help="file need process, same as training data in lm",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ltp", type=str, default="./resources/ltp", help="resources for LTP tokenizer, usually a path"
|
||||
)
|
||||
parser.add_argument("--bert", type=str, default="./resources/robert", help="resources for Bert tokenizer")
|
||||
parser.add_argument("--save_path", type=str, default="./resources/ref.txt", help="path to save res")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
363
examples/legacy/run_language_modeling.py
Normal file
363
examples/legacy/run_language_modeling.py
Normal file
@@ -0,0 +1,363 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, CTRL, BERT, RoBERTa, XLNet).
|
||||
GPT, GPT-2 and CTRL are fine-tuned using a causal language modeling (CLM) loss. BERT and RoBERTa are fine-tuned
|
||||
using a masked language modeling (MLM) loss. XLNet is fine-tuned using a permutation language modeling (PLM) loss.
|
||||
"""
|
||||
|
||||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from glob import glob
|
||||
from typing import Optional
|
||||
|
||||
from torch.utils.data import ConcatDataset
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_WITH_LM_HEAD_MAPPING,
|
||||
AutoConfig,
|
||||
AutoModelWithLMHead,
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
DataCollatorForPermutationLanguageModeling,
|
||||
DataCollatorForWholeWordMask,
|
||||
HfArgumentParser,
|
||||
LineByLineTextDataset,
|
||||
LineByLineWithRefDataset,
|
||||
PreTrainedTokenizer,
|
||||
TextDataset,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
|
||||
"""
|
||||
|
||||
model_name_or_path: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
|
||||
},
|
||||
)
|
||||
model_type: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
"""
|
||||
|
||||
train_data_file: Optional[str] = field(
|
||||
default=None, metadata={"help": "The input training data file (a text file)."}
|
||||
)
|
||||
train_data_files: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The input training data files (multiple files in glob format). "
|
||||
"Very often splitting large files to smaller files can prevent tokenizer going out of memory"
|
||||
},
|
||||
)
|
||||
eval_data_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
|
||||
)
|
||||
train_ref_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input train ref data file for whole word mask in Chinese."},
|
||||
)
|
||||
eval_ref_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input eval ref data file for whole word mask in Chinese."},
|
||||
)
|
||||
line_by_line: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
|
||||
)
|
||||
|
||||
mlm: bool = field(
|
||||
default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
|
||||
)
|
||||
whole_word_mask: bool = field(default=False, metadata={"help": "Whether ot not to use whole word mask."})
|
||||
mlm_probability: float = field(
|
||||
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
|
||||
)
|
||||
plm_probability: float = field(
|
||||
default=1 / 6,
|
||||
metadata={
|
||||
"help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling."
|
||||
},
|
||||
)
|
||||
max_span_length: int = field(
|
||||
default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
|
||||
)
|
||||
|
||||
block_size: int = field(
|
||||
default=-1,
|
||||
metadata={
|
||||
"help": "Optional input sequence length after tokenization."
|
||||
"The training dataset will be truncated in block of this size for training."
|
||||
"Default to the model max input length for single sentence inputs (take into account special tokens)."
|
||||
},
|
||||
)
|
||||
overwrite_cache: bool = field(
|
||||
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
||||
)
|
||||
|
||||
|
||||
def get_dataset(
|
||||
args: DataTrainingArguments,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
evaluate: bool = False,
|
||||
cache_dir: Optional[str] = None,
|
||||
):
|
||||
def _dataset(file_path, ref_path=None):
|
||||
if args.line_by_line:
|
||||
if ref_path is not None:
|
||||
if not args.whole_word_mask or not args.mlm:
|
||||
raise ValueError("You need to set world whole masking and mlm to True for Chinese Whole Word Mask")
|
||||
return LineByLineWithRefDataset(
|
||||
tokenizer=tokenizer,
|
||||
file_path=file_path,
|
||||
block_size=args.block_size,
|
||||
ref_path=ref_path,
|
||||
)
|
||||
|
||||
return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
|
||||
else:
|
||||
return TextDataset(
|
||||
tokenizer=tokenizer,
|
||||
file_path=file_path,
|
||||
block_size=args.block_size,
|
||||
overwrite_cache=args.overwrite_cache,
|
||||
cache_dir=cache_dir,
|
||||
)
|
||||
|
||||
if evaluate:
|
||||
return _dataset(args.eval_data_file, args.eval_ref_file)
|
||||
elif args.train_data_files:
|
||||
return ConcatDataset([_dataset(f) for f in glob(args.train_data_files)])
|
||||
else:
|
||||
return _dataset(args.train_data_file, args.train_ref_file)
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
if data_args.eval_data_file is None and training_args.do_eval:
|
||||
raise ValueError(
|
||||
"Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
|
||||
"or remove the --do_eval argument."
|
||||
)
|
||||
if (
|
||||
os.path.exists(training_args.output_dir)
|
||||
and os.listdir(training_args.output_dir)
|
||||
and training_args.do_train
|
||||
and not training_args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
||||
)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_rank,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.local_rank != -1),
|
||||
training_args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Set seed
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
if model_args.config_name:
|
||||
config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
|
||||
elif model_args.model_name_or_path:
|
||||
config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
|
||||
else:
|
||||
config = CONFIG_MAPPING[model_args.model_type]()
|
||||
logger.warning("You are instantiating a new config instance from scratch.")
|
||||
|
||||
if model_args.tokenizer_name:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
|
||||
elif model_args.model_name_or_path:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
|
||||
else:
|
||||
raise ValueError(
|
||||
"You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
|
||||
"and load it from here, using --tokenizer_name"
|
||||
)
|
||||
|
||||
if model_args.model_name_or_path:
|
||||
model = AutoModelWithLMHead.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
else:
|
||||
logger.info("Training new model from scratch")
|
||||
model = AutoModelWithLMHead.from_config(config)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
|
||||
raise ValueError(
|
||||
"BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
|
||||
"--mlm flag (masked language modeling)."
|
||||
)
|
||||
|
||||
if data_args.block_size <= 0:
|
||||
data_args.block_size = tokenizer.max_len
|
||||
# Our input block size will be the max possible for the model
|
||||
else:
|
||||
data_args.block_size = min(data_args.block_size, tokenizer.max_len)
|
||||
|
||||
# Get datasets
|
||||
|
||||
train_dataset = (
|
||||
get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
|
||||
)
|
||||
eval_dataset = (
|
||||
get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
if config.model_type == "xlnet":
|
||||
data_collator = DataCollatorForPermutationLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
plm_probability=data_args.plm_probability,
|
||||
max_span_length=data_args.max_span_length,
|
||||
)
|
||||
else:
|
||||
if data_args.mlm and data_args.whole_word_mask:
|
||||
data_collator = DataCollatorForWholeWordMask(
|
||||
tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
|
||||
)
|
||||
else:
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
|
||||
)
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
data_collator=data_collator,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
prediction_loss_only=True,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
model_path = (
|
||||
model_args.model_name_or_path
|
||||
if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
|
||||
else None
|
||||
)
|
||||
trainer.train(model_path=model_path)
|
||||
trainer.save_model()
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
if trainer.is_world_master():
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
|
||||
eval_output = trainer.evaluate()
|
||||
|
||||
perplexity = math.exp(eval_output["eval_loss"])
|
||||
result = {"perplexity": perplexity}
|
||||
|
||||
output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
|
||||
if trainer.is_world_master():
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
results.update(result)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
319
examples/legacy/run_openai_gpt.py
Normal file
319
examples/legacy/run_openai_gpt.py
Normal file
@@ -0,0 +1,319 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" OpenAI GPT model fine-tuning script.
|
||||
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
||||
It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
||||
|
||||
This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset:
|
||||
python run_openai_gpt.py \
|
||||
--model_name openai-gpt \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
|
||||
--eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
|
||||
--output_dir ../log \
|
||||
--train_batch_size 16 \
|
||||
"""
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
OpenAIGPTDoubleHeadsModel,
|
||||
OpenAIGPTTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
def load_rocstories_dataset(dataset_path):
|
||||
""" Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
|
||||
with open(dataset_path, encoding="utf_8") as f:
|
||||
f = csv.reader(f)
|
||||
output = []
|
||||
next(f) # skip the first line
|
||||
for line in tqdm(f):
|
||||
output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
|
||||
return output
|
||||
|
||||
|
||||
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
|
||||
"""Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
|
||||
|
||||
To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
|
||||
input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
"""
|
||||
tensor_datasets = []
|
||||
for dataset in encoded_datasets:
|
||||
n_batch = len(dataset)
|
||||
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
|
||||
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
|
||||
mc_labels = np.zeros((n_batch,), dtype=np.int64)
|
||||
for (
|
||||
i,
|
||||
(story, cont1, cont2, mc_label),
|
||||
) in enumerate(dataset):
|
||||
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
|
||||
input_ids[i, 0, : len(with_cont1)] = with_cont1
|
||||
input_ids[i, 1, : len(with_cont2)] = with_cont2
|
||||
mc_token_ids[i, 0] = len(with_cont1) - 1
|
||||
mc_token_ids[i, 1] = len(with_cont2) - 1
|
||||
lm_labels[i, 0, : len(with_cont1)] = with_cont1
|
||||
lm_labels[i, 1, : len(with_cont2)] = with_cont2
|
||||
mc_labels[i] = mc_label
|
||||
all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
|
||||
tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
|
||||
return tensor_datasets
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument("--train_dataset", type=str, default="")
|
||||
parser.add_argument("--eval_dataset", type=str, default="")
|
||||
parser.add_argument("--seed", type=int, default=42)
|
||||
parser.add_argument("--num_train_epochs", type=int, default=3)
|
||||
parser.add_argument("--train_batch_size", type=int, default=8)
|
||||
parser.add_argument("--eval_batch_size", type=int, default=16)
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", type=int, default=1)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training \
|
||||
steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before\
|
||||
performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", type=float, default=6.25e-5)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
|
||||
parser.add_argument("--weight_decay", type=float, default=0.01)
|
||||
parser.add_argument("--lm_coef", type=float, default=0.9)
|
||||
parser.add_argument("--n_valid", type=int, default=374)
|
||||
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
logger.info("device: {}, n_gpu {}".format(device, n_gpu))
|
||||
|
||||
if not args.do_train and not args.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
# Load tokenizer and model
|
||||
# This loading functions also add new tokens and embeddings called `special tokens`
|
||||
# These new embeddings will be fine-tuned on the RocStories dataset
|
||||
special_tokens = ["_start_", "_delimiter_", "_classify_"]
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
|
||||
tokenizer.add_tokens(special_tokens)
|
||||
special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
model.to(device)
|
||||
|
||||
# Load and encode the datasets
|
||||
def tokenize_and_encode(obj):
|
||||
""" Tokenize and encode a nested object """
|
||||
if isinstance(obj, str):
|
||||
return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
|
||||
elif isinstance(obj, int):
|
||||
return obj
|
||||
return list(tokenize_and_encode(o) for o in obj)
|
||||
|
||||
logger.info("Encoding dataset...")
|
||||
train_dataset = load_rocstories_dataset(args.train_dataset)
|
||||
eval_dataset = load_rocstories_dataset(args.eval_dataset)
|
||||
datasets = (train_dataset, eval_dataset)
|
||||
encoded_datasets = tokenize_and_encode(datasets)
|
||||
|
||||
# Compute the max input length for the Transformer
|
||||
max_length = model.config.n_positions // 2 - 2
|
||||
input_length = max(
|
||||
len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
|
||||
for dataset in encoded_datasets
|
||||
for story, cont1, cont2, _ in dataset
|
||||
)
|
||||
input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model
|
||||
|
||||
# Prepare inputs tensors and dataloaders
|
||||
tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
|
||||
train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
|
||||
|
||||
train_data = TensorDataset(*train_tensor_dataset)
|
||||
train_sampler = RandomSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
eval_data = TensorDataset(*eval_tensor_dataset)
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.do_train:
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
if args.do_train:
|
||||
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
tr_loss = 0
|
||||
nb_tr_steps = 0
|
||||
tqdm_bar = tqdm(train_dataloader, desc="Training")
|
||||
for step, batch in enumerate(tqdm_bar):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
|
||||
loss = args.lm_coef * losses[0] + losses[1]
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
tr_loss += loss.item()
|
||||
exp_average_loss = (
|
||||
loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
|
||||
)
|
||||
nb_tr_steps += 1
|
||||
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
|
||||
|
||||
# Save a trained model
|
||||
if args.do_train:
|
||||
# Save a trained model, configuration and tokenizer
|
||||
model_to_save = model.module if hasattr(model, "module") else model # Only save the model itself
|
||||
|
||||
# If we save using the predefined names, we can load using `from_pretrained`
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
model_to_save.config.to_json_file(output_config_file)
|
||||
tokenizer.save_vocabulary(args.output_dir)
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
|
||||
model.to(device)
|
||||
|
||||
if args.do_eval:
|
||||
model.eval()
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
with torch.no_grad():
|
||||
_, mc_loss, _, mc_logits = model(
|
||||
input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
|
||||
)
|
||||
|
||||
mc_logits = mc_logits.detach().cpu().numpy()
|
||||
mc_labels = mc_labels.to("cpu").numpy()
|
||||
tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
|
||||
|
||||
eval_loss += mc_loss.mean().item()
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_examples += input_ids.size(0)
|
||||
nb_eval_steps += 1
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
train_loss = tr_loss / nb_tr_steps if args.do_train else None
|
||||
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
719
examples/legacy/run_swag.py
Normal file
719
examples/legacy/run_swag.py
Normal file
@@ -0,0 +1,719 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner.
|
||||
Finetuning the library models for multiple choice on SWAG (Bert).
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModelForMultipleChoice,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SwagExample(object):
|
||||
"""A single training/test example for the SWAG dataset."""
|
||||
|
||||
def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
|
||||
self.swag_id = swag_id
|
||||
self.context_sentence = context_sentence
|
||||
self.start_ending = start_ending
|
||||
self.endings = [
|
||||
ending_0,
|
||||
ending_1,
|
||||
ending_2,
|
||||
ending_3,
|
||||
]
|
||||
self.label = label
|
||||
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self):
|
||||
attributes = [
|
||||
"swag_id: {}".format(self.swag_id),
|
||||
"context_sentence: {}".format(self.context_sentence),
|
||||
"start_ending: {}".format(self.start_ending),
|
||||
"ending_0: {}".format(self.endings[0]),
|
||||
"ending_1: {}".format(self.endings[1]),
|
||||
"ending_2: {}".format(self.endings[2]),
|
||||
"ending_3: {}".format(self.endings[3]),
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
attributes.append("label: {}".format(self.label))
|
||||
|
||||
return ", ".join(attributes)
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
def __init__(self, example_id, choices_features, label):
|
||||
self.example_id = example_id
|
||||
self.choices_features = [
|
||||
{"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
|
||||
for _, input_ids, input_mask, segment_ids in choices_features
|
||||
]
|
||||
self.label = label
|
||||
|
||||
|
||||
def read_swag_examples(input_file, is_training=True):
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
lines = list(csv.reader(f))
|
||||
|
||||
if is_training and lines[0][-1] != "label":
|
||||
raise ValueError("For training, the input file must contain a label column.")
|
||||
|
||||
examples = [
|
||||
SwagExample(
|
||||
swag_id=line[2],
|
||||
context_sentence=line[4],
|
||||
start_ending=line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
ending_0=line[7],
|
||||
ending_1=line[8],
|
||||
ending_2=line[9],
|
||||
ending_3=line[10],
|
||||
label=int(line[11]) if is_training else None,
|
||||
)
|
||||
for line in lines[1:] # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
|
||||
|
||||
def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
|
||||
# Swag is a multiple choice task. To perform this task using Bert,
|
||||
# we will use the formatting proposed in "Improving Language
|
||||
# Understanding by Generative Pre-Training" and suggested by
|
||||
# @jacobdevlin-google in this issue
|
||||
# https://github.com/google-research/bert/issues/38.
|
||||
#
|
||||
# Each choice will correspond to a sample on which we run the
|
||||
# inference. For a given Swag example, we will create the 4
|
||||
# following inputs:
|
||||
# - [CLS] context [SEP] choice_1 [SEP]
|
||||
# - [CLS] context [SEP] choice_2 [SEP]
|
||||
# - [CLS] context [SEP] choice_3 [SEP]
|
||||
# - [CLS] context [SEP] choice_4 [SEP]
|
||||
# The model will output a single value for each input. To get the
|
||||
# final decision of the model, we will run a softmax over these 4
|
||||
# outputs.
|
||||
features = []
|
||||
for example_index, example in tqdm(enumerate(examples)):
|
||||
context_tokens = tokenizer.tokenize(example.context_sentence)
|
||||
start_ending_tokens = tokenizer.tokenize(example.start_ending)
|
||||
|
||||
choices_features = []
|
||||
for ending_index, ending in enumerate(example.endings):
|
||||
# We create a copy of the context tokens in order to be
|
||||
# able to shrink it according to ending_tokens
|
||||
context_tokens_choice = context_tokens[:]
|
||||
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
|
||||
# Modifies `context_tokens_choice` and `ending_tokens` in
|
||||
# place so that the total length is less than the
|
||||
# specified length. Account for [CLS], [SEP], [SEP] with
|
||||
# "- 3"
|
||||
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
|
||||
|
||||
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
|
||||
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
input_mask = [1] * len(input_ids)
|
||||
|
||||
# Zero-pad up to the sequence length.
|
||||
padding = [0] * (max_seq_length - len(input_ids))
|
||||
input_ids += padding
|
||||
input_mask += padding
|
||||
segment_ids += padding
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
assert len(segment_ids) == max_seq_length
|
||||
|
||||
choices_features.append((tokens, input_ids, input_mask, segment_ids))
|
||||
|
||||
label = example.label
|
||||
if example_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("swag_id: {}".format(example.swag_id))
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info("tokens: {}".format(" ".join(tokens)))
|
||||
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info("label: {}".format(label))
|
||||
|
||||
features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
|
||||
|
||||
return features
|
||||
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b):
|
||||
tokens_a.pop()
|
||||
else:
|
||||
tokens_b.pop()
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
def select_field(features, field):
|
||||
return [[choice[field] for choice in feature.choices_features] for feature in features]
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
if args.n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0]:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_file = args.predict_file if evaluate else args.train_file
|
||||
cached_features_file = os.path.join(
|
||||
os.path.dirname(input_file),
|
||||
"cached_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_file)
|
||||
examples = read_swag_examples(input_file)
|
||||
features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
|
||||
if evaluate:
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
else:
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
return dataset
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
""" Train the model """
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer = SummaryWriter()
|
||||
|
||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
if args.n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2],
|
||||
"token_type_ids": batch[2],
|
||||
"labels": batch[3],
|
||||
}
|
||||
# if args.model_type in ['xlnet', 'xlm']:
|
||||
# inputs.update({'cls_index': batch[5],
|
||||
# 'p_mask': batch[6]})
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
if args.fp16:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||
else:
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
||||
|
||||
tr_loss += loss.item()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
model.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_vocabulary(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
train_iterator.close()
|
||||
break
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer.close()
|
||||
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
|
||||
|
||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||
# Note that DistributedSampler samples randomly
|
||||
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
model.eval()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
with torch.no_grad():
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||
"token_type_ids": batch[2],
|
||||
"labels": batch[3],
|
||||
}
|
||||
|
||||
# if args.model_type in ['xlnet', 'xlm']:
|
||||
# inputs.update({'cls_index': batch[4],
|
||||
# 'p_mask': batch[5]})
|
||||
outputs = model(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
eval_loss += tmp_eval_loss.mean().item()
|
||||
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = inputs["labels"].to("cpu").numpy()
|
||||
tmp_eval_accuracy = accuracy(logits, label_ids)
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_steps += 1
|
||||
nb_eval_examples += inputs["input_ids"].size(0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info("%s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--predict_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="SWAG csv for predictions. E.g., val.csv or test.csv",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
# Setup CUDA, GPU & distributed training
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
if args.local_rank not in [-1, 0]:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
|
||||
config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
)
|
||||
model = AutoModelForMultipleChoice.from_pretrained(
|
||||
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
|
||||
model.to(args.device)
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
# Training
|
||||
if args.do_train:
|
||||
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
if args.do_train:
|
||||
checkpoints = [args.output_dir]
|
||||
else:
|
||||
# if do_train is False and do_eval is true, load model directly from pretrained.
|
||||
checkpoints = [args.model_name_or_path]
|
||||
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
# Reload the model
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
logger.info("Results: {}".format(results))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
144
examples/legacy/run_transfo_xl.py
Normal file
144
examples/legacy/run_transfo_xl.py
Normal file
@@ -0,0 +1,144 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" PyTorch Transformer XL model evaluation script.
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
|
||||
|
||||
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from transformers import TransfoXLCorpus, TransfoXLLMHeadModel
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
|
||||
parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
|
||||
parser.add_argument(
|
||||
"--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=10, help="batch size")
|
||||
parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
|
||||
parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
|
||||
parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
|
||||
parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
|
||||
parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
|
||||
parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
|
||||
parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
assert args.ext_len >= 0, "extended context length must be non-negative"
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
logger.info("device: {}".format(device))
|
||||
|
||||
# Load a pre-processed dataset
|
||||
# You can also build the corpus yourself using TransfoXLCorpus methods
|
||||
# The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
|
||||
# and tokenizing the dataset
|
||||
# The pre-processed corpus is a convertion (using the conversion script )
|
||||
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
||||
|
||||
va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
|
||||
te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
|
||||
|
||||
# Load a pre-trained model
|
||||
model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
|
||||
model.to(device)
|
||||
|
||||
logger.info(
|
||||
"Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
|
||||
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
|
||||
)
|
||||
)
|
||||
|
||||
model.reset_memory_length(args.mem_len)
|
||||
if args.clamp_len > 0:
|
||||
model.clamp_len = args.clamp_len
|
||||
if args.same_length:
|
||||
model.same_length = True
|
||||
|
||||
###############################################################################
|
||||
# Evaluation code
|
||||
###############################################################################
|
||||
def evaluate(eval_iter):
|
||||
# Turn on evaluation mode which disables dropout.
|
||||
model.eval()
|
||||
total_len, total_loss = 0, 0.0
|
||||
start_time = time.time()
|
||||
with torch.no_grad():
|
||||
mems = None
|
||||
for idx, (data, target, seq_len) in enumerate(eval_iter):
|
||||
ret = model(data, lm_labels=target, mems=mems)
|
||||
loss, _, mems = ret
|
||||
loss = loss.mean()
|
||||
total_loss += seq_len * loss.item()
|
||||
total_len += seq_len
|
||||
total_time = time.time() - start_time
|
||||
logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
|
||||
return total_loss / total_len
|
||||
|
||||
# Run on test data.
|
||||
if args.split == "all":
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = evaluate(va_iter)
|
||||
elif args.split == "valid":
|
||||
valid_loss = evaluate(va_iter)
|
||||
test_loss = None
|
||||
elif args.split == "test":
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = None
|
||||
|
||||
def format_log(loss, split):
|
||||
log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
|
||||
return log_str
|
||||
|
||||
log_str = ""
|
||||
if valid_loss is not None:
|
||||
log_str += format_log(valid_loss, "valid")
|
||||
if test_loss is not None:
|
||||
log_str += format_log(test_loss, "test")
|
||||
|
||||
logger.info("=" * 100)
|
||||
logger.info(log_str)
|
||||
logger.info("=" * 100)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
229
examples/legacy/token-classification/README.md
Normal file
229
examples/legacy/token-classification/README.md
Normal file
@@ -0,0 +1,229 @@
|
||||
## Token classification
|
||||
|
||||
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/legacy/token-classification/run_ner.py).
|
||||
|
||||
The following examples are covered in this section:
|
||||
|
||||
* NER on the GermEval 2014 (German NER) dataset
|
||||
* Emerging and Rare Entities task: WNUT’17 (English NER) dataset
|
||||
|
||||
Details and results for the fine-tuning provided by @stefan-it.
|
||||
|
||||
### GermEval 2014 (German NER) dataset
|
||||
|
||||
#### Data (Download and pre-processing steps)
|
||||
|
||||
Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
|
||||
|
||||
Here are the commands for downloading and pre-processing train, dev and test datasets. The original data format has four (tab-separated) columns, in a pre-processing step only the two relevant columns (token and outer span NER annotation) are extracted:
|
||||
|
||||
```bash
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
||||
```
|
||||
|
||||
The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`.
|
||||
One problem with these tokens is, that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s.
|
||||
The `preprocess.py` script located in the `scripts` folder a) filters these tokens and b) splits longer sentences into smaller ones (once the max. subtoken length is reached).
|
||||
|
||||
Let's define some variables that we need for further pre-processing steps and training the model:
|
||||
|
||||
```bash
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=bert-base-multilingual-cased
|
||||
```
|
||||
|
||||
Run the pre-processing script on training, dev and test datasets:
|
||||
|
||||
```bash
|
||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
||||
```
|
||||
|
||||
The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so an own set of labels must be used:
|
||||
|
||||
```bash
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||
```
|
||||
|
||||
#### Prepare the run
|
||||
|
||||
Additional environment variables must be set:
|
||||
|
||||
```bash
|
||||
export OUTPUT_DIR=germeval-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
```
|
||||
|
||||
#### Run the Pytorch version
|
||||
|
||||
To start training, just run:
|
||||
|
||||
```bash
|
||||
python3 run_ner.py --data_dir ./ \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_device_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
```
|
||||
|
||||
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
||||
|
||||
#### JSON-based configuration file
|
||||
|
||||
Instead of passing all parameters via commandline arguments, the `run_ner.py` script also supports reading parameters from a json-based configuration file:
|
||||
|
||||
```json
|
||||
{
|
||||
"data_dir": ".",
|
||||
"labels": "./labels.txt",
|
||||
"model_name_or_path": "bert-base-multilingual-cased",
|
||||
"output_dir": "germeval-model",
|
||||
"max_seq_length": 128,
|
||||
"num_train_epochs": 3,
|
||||
"per_device_train_batch_size": 32,
|
||||
"save_steps": 750,
|
||||
"seed": 1,
|
||||
"do_train": true,
|
||||
"do_eval": true,
|
||||
"do_predict": true
|
||||
}
|
||||
```
|
||||
|
||||
It must be saved with a `.json` extension and can be used by running `python3 run_ner.py config.json`.
|
||||
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on development dataset outputs the following for our example:
|
||||
|
||||
```bash
|
||||
10/04/2019 00:42:06 - INFO - __main__ - ***** Eval results *****
|
||||
10/04/2019 00:42:06 - INFO - __main__ - f1 = 0.8623348017621146
|
||||
10/04/2019 00:42:06 - INFO - __main__ - loss = 0.07183869666975543
|
||||
10/04/2019 00:42:06 - INFO - __main__ - precision = 0.8467916366258111
|
||||
10/04/2019 00:42:06 - INFO - __main__ - recall = 0.8784592370979806
|
||||
```
|
||||
|
||||
On the test dataset the following results could be achieved:
|
||||
|
||||
```bash
|
||||
10/04/2019 00:42:42 - INFO - __main__ - ***** Eval results *****
|
||||
10/04/2019 00:42:42 - INFO - __main__ - f1 = 0.8614389652384803
|
||||
10/04/2019 00:42:42 - INFO - __main__ - loss = 0.07064602487454782
|
||||
10/04/2019 00:42:42 - INFO - __main__ - precision = 0.8604651162790697
|
||||
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
||||
```
|
||||
|
||||
### Emerging and Rare Entities task: WNUT’17 (English NER) dataset
|
||||
|
||||
Description of the WNUT’17 task from the [shared task website](http://noisy-text.github.io/2017/index.html):
|
||||
|
||||
> The WNUT’17 shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions.
|
||||
> Named entities form the basis of many modern approaches to other tasks (like event clustering and summarization), but recall on
|
||||
> them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms.
|
||||
|
||||
Six labels are available in the dataset. An overview can be found on this [page](http://noisy-text.github.io/2017/files/).
|
||||
|
||||
#### Data (Download and pre-processing steps)
|
||||
|
||||
The dataset can be downloaded from the [official GitHub](https://github.com/leondz/emerging_entities_17) repository.
|
||||
|
||||
The following commands show how to prepare the dataset for fine-tuning:
|
||||
|
||||
```bash
|
||||
mkdir -p data_wnut_17
|
||||
|
||||
curl -L 'https://github.com/leondz/emerging_entities_17/raw/master/wnut17train.conll' | tr '\t' ' ' > data_wnut_17/train.txt.tmp
|
||||
curl -L 'https://github.com/leondz/emerging_entities_17/raw/master/emerging.dev.conll' | tr '\t' ' ' > data_wnut_17/dev.txt.tmp
|
||||
curl -L 'https://raw.githubusercontent.com/leondz/emerging_entities_17/master/emerging.test.annotated' | tr '\t' ' ' > data_wnut_17/test.txt.tmp
|
||||
```
|
||||
|
||||
Let's define some variables that we need for further pre-processing steps:
|
||||
|
||||
```bash
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=bert-large-cased
|
||||
```
|
||||
|
||||
Here we use the English BERT large model for fine-tuning.
|
||||
The `preprocess.py` scripts splits longer sentences into smaller ones (once the max. subtoken length is reached):
|
||||
|
||||
```bash
|
||||
python3 scripts/preprocess.py data_wnut_17/train.txt.tmp $BERT_MODEL $MAX_LENGTH > data_wnut_17/train.txt
|
||||
python3 scripts/preprocess.py data_wnut_17/dev.txt.tmp $BERT_MODEL $MAX_LENGTH > data_wnut_17/dev.txt
|
||||
python3 scripts/preprocess.py data_wnut_17/test.txt.tmp $BERT_MODEL $MAX_LENGTH > data_wnut_17/test.txt
|
||||
```
|
||||
|
||||
In the last pre-processing step, the `labels.txt` file needs to be generated. This file contains all available labels:
|
||||
|
||||
```bash
|
||||
cat data_wnut_17/train.txt data_wnut_17/dev.txt data_wnut_17/test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > data_wnut_17/labels.txt
|
||||
```
|
||||
|
||||
#### Run the Pytorch version
|
||||
|
||||
Fine-tuning with the PyTorch version can be started using the `run_ner.py` script. In this example we use a JSON-based configuration file.
|
||||
|
||||
This configuration file looks like:
|
||||
|
||||
```json
|
||||
{
|
||||
"data_dir": "./data_wnut_17",
|
||||
"labels": "./data_wnut_17/labels.txt",
|
||||
"model_name_or_path": "bert-large-cased",
|
||||
"output_dir": "wnut-17-model-1",
|
||||
"max_seq_length": 128,
|
||||
"num_train_epochs": 3,
|
||||
"per_device_train_batch_size": 32,
|
||||
"save_steps": 425,
|
||||
"seed": 1,
|
||||
"do_train": true,
|
||||
"do_eval": true,
|
||||
"do_predict": true,
|
||||
"fp16": false
|
||||
}
|
||||
```
|
||||
|
||||
If your GPU supports half-precision training, please set `fp16` to `true`.
|
||||
|
||||
Save this JSON-based configuration under `wnut_17.json`. The fine-tuning can be started with `python3 run_ner_old.py wnut_17.json`.
|
||||
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on development dataset outputs the following:
|
||||
|
||||
```bash
|
||||
05/29/2020 23:33:44 - INFO - __main__ - ***** Eval results *****
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_loss = 0.26505235286212275
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_precision = 0.7008264462809918
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_recall = 0.507177033492823
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_f1 = 0.5884802220680084
|
||||
05/29/2020 23:33:44 - INFO - __main__ - epoch = 3.0
|
||||
```
|
||||
|
||||
On the test dataset the following results could be achieved:
|
||||
|
||||
```bash
|
||||
05/29/2020 23:33:44 - INFO - transformers.trainer - ***** Running Prediction *****
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_loss = 0.30948806500973547
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_precision = 0.5840108401084011
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_recall = 0.3994439295644115
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_f1 = 0.47440836543753434
|
||||
```
|
||||
|
||||
WNUT’17 is a very difficult task. Current state-of-the-art results on this dataset can be found [here](http://nlpprogress.com/english/named_entity_recognition.html).
|
||||
36
examples/legacy/token-classification/run.sh
Executable file
36
examples/legacy/token-classification/run.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
## The relevant files are currently on a shared Google
|
||||
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
|
||||
## Monitor for changes and eventually migrate to nlp dataset
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
||||
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=bert-base-multilingual-cased
|
||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||
export OUTPUT_DIR=germeval-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
python3 run_ner.py \
|
||||
--task_type NER \
|
||||
--data_dir . \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_gpu_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
37
examples/legacy/token-classification/run_chunk.sh
Executable file
37
examples/legacy/token-classification/run_chunk.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
if ! [ -f ./dev.txt ]; then
|
||||
echo "Downloading CONLL2003 dev dataset...."
|
||||
curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
|
||||
fi
|
||||
|
||||
if ! [ -f ./test.txt ]; then
|
||||
echo "Downloading CONLL2003 test dataset...."
|
||||
curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
|
||||
fi
|
||||
|
||||
if ! [ -f ./train.txt ]; then
|
||||
echo "Downloading CONLL2003 train dataset...."
|
||||
curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
|
||||
fi
|
||||
|
||||
export MAX_LENGTH=200
|
||||
export BERT_MODEL=bert-base-uncased
|
||||
export OUTPUT_DIR=chunker-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
python3 run_ner.py \
|
||||
--task_type Chunk \
|
||||
--data_dir . \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_gpu_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
|
||||
316
examples/legacy/token-classification/run_ner.py
Normal file
316
examples/legacy/token-classification/run_ner.py
Normal file
@@ -0,0 +1,316 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from importlib import import_module
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
from torch import nn
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForTokenClassification,
|
||||
AutoTokenizer,
|
||||
EvalPrediction,
|
||||
HfArgumentParser,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
task_type: Optional[str] = field(
|
||||
default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
|
||||
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
|
||||
# or just modify its tokenizer_config.json.
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
"""
|
||||
|
||||
data_dir: str = field(
|
||||
metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."}
|
||||
)
|
||||
labels: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."},
|
||||
)
|
||||
max_seq_length: int = field(
|
||||
default=128,
|
||||
metadata={
|
||||
"help": "The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
},
|
||||
)
|
||||
overwrite_cache: bool = field(
|
||||
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
if (
|
||||
os.path.exists(training_args.output_dir)
|
||||
and os.listdir(training_args.output_dir)
|
||||
and training_args.do_train
|
||||
and not training_args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
||||
)
|
||||
|
||||
module = import_module("tasks")
|
||||
try:
|
||||
token_classification_task_clazz = getattr(module, model_args.task_type)
|
||||
token_classification_task: TokenClassificationTask = token_classification_task_clazz()
|
||||
except AttributeError:
|
||||
raise ValueError(
|
||||
f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
|
||||
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
|
||||
)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_rank,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.local_rank != -1),
|
||||
training_args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Set seed
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Prepare CONLL-2003 task
|
||||
labels = token_classification_task.get_labels(data_args.labels)
|
||||
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
|
||||
num_labels = len(labels)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
id2label=label_map,
|
||||
label2id={label: i for i, label in enumerate(labels)},
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=model_args.use_fast,
|
||||
)
|
||||
model = AutoModelForTokenClassification.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
|
||||
# Get datasets
|
||||
train_dataset = (
|
||||
TokenClassificationDataset(
|
||||
token_classification_task=token_classification_task,
|
||||
data_dir=data_args.data_dir,
|
||||
tokenizer=tokenizer,
|
||||
labels=labels,
|
||||
model_type=config.model_type,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
overwrite_cache=data_args.overwrite_cache,
|
||||
mode=Split.train,
|
||||
)
|
||||
if training_args.do_train
|
||||
else None
|
||||
)
|
||||
eval_dataset = (
|
||||
TokenClassificationDataset(
|
||||
token_classification_task=token_classification_task,
|
||||
data_dir=data_args.data_dir,
|
||||
tokenizer=tokenizer,
|
||||
labels=labels,
|
||||
model_type=config.model_type,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
overwrite_cache=data_args.overwrite_cache,
|
||||
mode=Split.dev,
|
||||
)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
|
||||
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
|
||||
preds = np.argmax(predictions, axis=2)
|
||||
|
||||
batch_size, seq_len = preds.shape
|
||||
|
||||
out_label_list = [[] for _ in range(batch_size)]
|
||||
preds_list = [[] for _ in range(batch_size)]
|
||||
|
||||
for i in range(batch_size):
|
||||
for j in range(seq_len):
|
||||
if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
|
||||
out_label_list[i].append(label_map[label_ids[i][j]])
|
||||
preds_list[i].append(label_map[preds[i][j]])
|
||||
|
||||
return preds_list, out_label_list
|
||||
|
||||
def compute_metrics(p: EvalPrediction) -> Dict:
|
||||
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
|
||||
return {
|
||||
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||
"precision": precision_score(out_label_list, preds_list),
|
||||
"recall": recall_score(out_label_list, preds_list),
|
||||
"f1": f1_score(out_label_list, preds_list),
|
||||
}
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
trainer.train(
|
||||
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
||||
)
|
||||
trainer.save_model()
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
if trainer.is_world_process_zero():
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
|
||||
result = trainer.evaluate()
|
||||
|
||||
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
|
||||
if trainer.is_world_process_zero():
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key, value in result.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
|
||||
results.update(result)
|
||||
|
||||
# Predict
|
||||
if training_args.do_predict:
|
||||
test_dataset = TokenClassificationDataset(
|
||||
token_classification_task=token_classification_task,
|
||||
data_dir=data_args.data_dir,
|
||||
tokenizer=tokenizer,
|
||||
labels=labels,
|
||||
model_type=config.model_type,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
overwrite_cache=data_args.overwrite_cache,
|
||||
mode=Split.test,
|
||||
)
|
||||
|
||||
predictions, label_ids, metrics = trainer.predict(test_dataset)
|
||||
preds_list, _ = align_predictions(predictions, label_ids)
|
||||
|
||||
output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
|
||||
if trainer.is_world_process_zero():
|
||||
with open(output_test_results_file, "w") as writer:
|
||||
for key, value in metrics.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
|
||||
# Save predictions
|
||||
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
|
||||
if trainer.is_world_process_zero():
|
||||
with open(output_test_predictions_file, "w") as writer:
|
||||
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
|
||||
token_classification_task.write_predictions_to_file(writer, f, preds_list)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
37
examples/legacy/token-classification/run_pos.sh
Executable file
37
examples/legacy/token-classification/run_pos.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
if ! [ -f ./dev.txt ]; then
|
||||
echo "Download dev dataset...."
|
||||
curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
|
||||
fi
|
||||
|
||||
if ! [ -f ./test.txt ]; then
|
||||
echo "Download test dataset...."
|
||||
curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
|
||||
fi
|
||||
|
||||
if ! [ -f ./train.txt ]; then
|
||||
echo "Download train dataset...."
|
||||
curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
|
||||
fi
|
||||
|
||||
export MAX_LENGTH=200
|
||||
export BERT_MODEL=bert-base-uncased
|
||||
export OUTPUT_DIR=postagger-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
python3 run_ner.py \
|
||||
--task_type POS \
|
||||
--data_dir . \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_gpu_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
|
||||
41
examples/legacy/token-classification/scripts/preprocess.py
Normal file
41
examples/legacy/token-classification/scripts/preprocess.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
dataset = sys.argv[1]
|
||||
model_name_or_path = sys.argv[2]
|
||||
max_len = int(sys.argv[3])
|
||||
|
||||
subword_len_counter = 0
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
||||
max_len -= tokenizer.num_special_tokens_to_add()
|
||||
|
||||
with open(dataset, "rt") as f_p:
|
||||
for line in f_p:
|
||||
line = line.rstrip()
|
||||
|
||||
if not line:
|
||||
print(line)
|
||||
subword_len_counter = 0
|
||||
continue
|
||||
|
||||
token = line.split()[0]
|
||||
|
||||
current_subwords_len = len(tokenizer.tokenize(token))
|
||||
|
||||
# Token contains strange control characters like \x96 or \x95
|
||||
# Just filter out the complete line
|
||||
if current_subwords_len == 0:
|
||||
continue
|
||||
|
||||
if (subword_len_counter + current_subwords_len) > max_len:
|
||||
print("")
|
||||
print(line)
|
||||
subword_len_counter = current_subwords_len
|
||||
continue
|
||||
|
||||
subword_len_counter += current_subwords_len
|
||||
|
||||
print(line)
|
||||
163
examples/legacy/token-classification/tasks.py
Normal file
163
examples/legacy/token-classification/tasks.py
Normal file
@@ -0,0 +1,163 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import List, TextIO, Union
|
||||
|
||||
from conllu import parse_incr
|
||||
|
||||
from utils_ner import InputExample, Split, TokenClassificationTask
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NER(TokenClassificationTask):
|
||||
def __init__(self, label_idx=-1):
|
||||
# in NER datasets, the last column is usually reserved for NER label
|
||||
self.label_idx = label_idx
|
||||
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
if isinstance(mode, Split):
|
||||
mode = mode.value
|
||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
||||
guid_index = 1
|
||||
examples = []
|
||||
with open(file_path, encoding="utf-8") as f:
|
||||
words = []
|
||||
labels = []
|
||||
for line in f:
|
||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
||||
if words:
|
||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
||||
guid_index += 1
|
||||
words = []
|
||||
labels = []
|
||||
else:
|
||||
splits = line.split(" ")
|
||||
words.append(splits[0])
|
||||
if len(splits) > 1:
|
||||
labels.append(splits[self.label_idx].replace("\n", ""))
|
||||
else:
|
||||
# Examples could have no label for mode = "test"
|
||||
labels.append("O")
|
||||
if words:
|
||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
||||
return examples
|
||||
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
|
||||
example_id = 0
|
||||
for line in test_input_reader:
|
||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
||||
writer.write(line)
|
||||
if not preds_list[example_id]:
|
||||
example_id += 1
|
||||
elif preds_list[example_id]:
|
||||
output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
|
||||
writer.write(output_line)
|
||||
else:
|
||||
logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
labels = f.read().splitlines()
|
||||
if "O" not in labels:
|
||||
labels = ["O"] + labels
|
||||
return labels
|
||||
else:
|
||||
return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
|
||||
|
||||
|
||||
class Chunk(NER):
|
||||
def __init__(self):
|
||||
# in CONLL2003 dataset chunk column is second-to-last
|
||||
super().__init__(label_idx=-2)
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
labels = f.read().splitlines()
|
||||
if "O" not in labels:
|
||||
labels = ["O"] + labels
|
||||
return labels
|
||||
else:
|
||||
return [
|
||||
"O",
|
||||
"B-ADVP",
|
||||
"B-INTJ",
|
||||
"B-LST",
|
||||
"B-PRT",
|
||||
"B-NP",
|
||||
"B-SBAR",
|
||||
"B-VP",
|
||||
"B-ADJP",
|
||||
"B-CONJP",
|
||||
"B-PP",
|
||||
"I-ADVP",
|
||||
"I-INTJ",
|
||||
"I-LST",
|
||||
"I-PRT",
|
||||
"I-NP",
|
||||
"I-SBAR",
|
||||
"I-VP",
|
||||
"I-ADJP",
|
||||
"I-CONJP",
|
||||
"I-PP",
|
||||
]
|
||||
|
||||
|
||||
class POS(TokenClassificationTask):
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
if isinstance(mode, Split):
|
||||
mode = mode.value
|
||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
||||
guid_index = 1
|
||||
examples = []
|
||||
|
||||
with open(file_path, encoding="utf-8") as f:
|
||||
for sentence in parse_incr(f):
|
||||
words = []
|
||||
labels = []
|
||||
for token in sentence:
|
||||
words.append(token["form"])
|
||||
labels.append(token["upos"])
|
||||
assert len(words) == len(labels)
|
||||
if words:
|
||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
||||
guid_index += 1
|
||||
return examples
|
||||
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
|
||||
example_id = 0
|
||||
for sentence in parse_incr(test_input_reader):
|
||||
s_p = preds_list[example_id]
|
||||
out = ""
|
||||
for token in sentence:
|
||||
out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
|
||||
out += "\n"
|
||||
writer.write(out)
|
||||
example_id += 1
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
return f.read().splitlines()
|
||||
else:
|
||||
return [
|
||||
"ADJ",
|
||||
"ADP",
|
||||
"ADV",
|
||||
"AUX",
|
||||
"CCONJ",
|
||||
"DET",
|
||||
"INTJ",
|
||||
"NOUN",
|
||||
"NUM",
|
||||
"PART",
|
||||
"PRON",
|
||||
"PROPN",
|
||||
"PUNCT",
|
||||
"SCONJ",
|
||||
"SYM",
|
||||
"VERB",
|
||||
"X",
|
||||
]
|
||||
372
examples/legacy/token-classification/utils_ner.py
Normal file
372
examples/legacy/token-classification/utils_ner.py
Normal file
@@ -0,0 +1,372 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from filelock import FileLock
|
||||
from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class InputExample:
|
||||
"""
|
||||
A single training/test example for token classification.
|
||||
|
||||
Args:
|
||||
guid: Unique id for the example.
|
||||
words: list. The words of the sequence.
|
||||
labels: (Optional) list. The labels for each word of the sequence. This should be
|
||||
specified for train and dev examples, but not for test examples.
|
||||
"""
|
||||
|
||||
guid: str
|
||||
words: List[str]
|
||||
labels: Optional[List[str]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class InputFeatures:
|
||||
"""
|
||||
A single set of features of data.
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
"""
|
||||
|
||||
input_ids: List[int]
|
||||
attention_mask: List[int]
|
||||
token_type_ids: Optional[List[int]] = None
|
||||
label_ids: Optional[List[int]] = None
|
||||
|
||||
|
||||
class Split(Enum):
|
||||
train = "train"
|
||||
dev = "dev"
|
||||
test = "test"
|
||||
|
||||
|
||||
class TokenClassificationTask:
|
||||
@staticmethod
|
||||
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_labels(path: str) -> List[str]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def convert_examples_to_features(
|
||||
examples: List[InputExample],
|
||||
label_list: List[str],
|
||||
max_seq_length: int,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
cls_token_at_end=False,
|
||||
cls_token="[CLS]",
|
||||
cls_token_segment_id=1,
|
||||
sep_token="[SEP]",
|
||||
sep_token_extra=False,
|
||||
pad_on_left=False,
|
||||
pad_token=0,
|
||||
pad_token_segment_id=0,
|
||||
pad_token_label_id=-100,
|
||||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True,
|
||||
) -> List[InputFeatures]:
|
||||
"""Loads a data file into a list of `InputFeatures`
|
||||
`cls_token_at_end` define the location of the CLS token:
|
||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
||||
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
|
||||
"""
|
||||
# TODO clean up all this to leverage built-in features of tokenizers
|
||||
|
||||
label_map = {label: i for i, label in enumerate(label_list)}
|
||||
|
||||
features = []
|
||||
for (ex_index, example) in enumerate(examples):
|
||||
if ex_index % 10_000 == 0:
|
||||
logger.info("Writing example %d of %d", ex_index, len(examples))
|
||||
|
||||
tokens = []
|
||||
label_ids = []
|
||||
for word, label in zip(example.words, example.labels):
|
||||
word_tokens = tokenizer.tokenize(word)
|
||||
|
||||
# bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
|
||||
if len(word_tokens) > 0:
|
||||
tokens.extend(word_tokens)
|
||||
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
|
||||
label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
|
||||
|
||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
||||
special_tokens_count = tokenizer.num_special_tokens_to_add()
|
||||
if len(tokens) > max_seq_length - special_tokens_count:
|
||||
tokens = tokens[: (max_seq_length - special_tokens_count)]
|
||||
label_ids = label_ids[: (max_seq_length - special_tokens_count)]
|
||||
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# (b) For single sequences:
|
||||
# tokens: [CLS] the dog is hairy . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
#
|
||||
# Where "type_ids" are used to indicate whether this is the first
|
||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
||||
# embedding vector (and position vector). This is not *strictly* necessary
|
||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
||||
# it easier for the model to learn the concept of sequences.
|
||||
#
|
||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||
# used as as the "sentence vector". Note that this only makes sense because
|
||||
# the entire model is fine-tuned.
|
||||
tokens += [sep_token]
|
||||
label_ids += [pad_token_label_id]
|
||||
if sep_token_extra:
|
||||
# roberta uses an extra separator b/w pairs of sentences
|
||||
tokens += [sep_token]
|
||||
label_ids += [pad_token_label_id]
|
||||
segment_ids = [sequence_a_segment_id] * len(tokens)
|
||||
|
||||
if cls_token_at_end:
|
||||
tokens += [cls_token]
|
||||
label_ids += [pad_token_label_id]
|
||||
segment_ids += [cls_token_segment_id]
|
||||
else:
|
||||
tokens = [cls_token] + tokens
|
||||
label_ids = [pad_token_label_id] + label_ids
|
||||
segment_ids = [cls_token_segment_id] + segment_ids
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
|
||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||
# tokens are attended to.
|
||||
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
|
||||
|
||||
# Zero-pad up to the sequence length.
|
||||
padding_length = max_seq_length - len(input_ids)
|
||||
if pad_on_left:
|
||||
input_ids = ([pad_token] * padding_length) + input_ids
|
||||
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
|
||||
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
|
||||
label_ids = ([pad_token_label_id] * padding_length) + label_ids
|
||||
else:
|
||||
input_ids += [pad_token] * padding_length
|
||||
input_mask += [0 if mask_padding_with_zero else 1] * padding_length
|
||||
segment_ids += [pad_token_segment_id] * padding_length
|
||||
label_ids += [pad_token_label_id] * padding_length
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
assert len(segment_ids) == max_seq_length
|
||||
assert len(label_ids) == max_seq_length
|
||||
|
||||
if ex_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("guid: %s", example.guid)
|
||||
logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
|
||||
logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
|
||||
logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
|
||||
logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
|
||||
logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
|
||||
|
||||
if "token_type_ids" not in tokenizer.model_input_names:
|
||||
segment_ids = None
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
|
||||
)
|
||||
)
|
||||
return features
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
class TokenClassificationDataset(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
|
||||
# Use cross entropy ignore_index as padding label id so that only
|
||||
# real label ids contribute to the loss later.
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
token_classification_task: TokenClassificationTask,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
labels: List[str],
|
||||
model_type: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
mode: Split = Split.train,
|
||||
):
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(
|
||||
data_dir,
|
||||
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
|
||||
)
|
||||
|
||||
# Make sure only the first process in distributed training processes the dataset,
|
||||
# and the others will use the cache.
|
||||
lock_path = cached_features_file + ".lock"
|
||||
with FileLock(lock_path):
|
||||
|
||||
if os.path.exists(cached_features_file) and not overwrite_cache:
|
||||
logger.info(f"Loading features from cached file {cached_features_file}")
|
||||
self.features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info(f"Creating features from dataset file at {data_dir}")
|
||||
examples = token_classification_task.read_examples_from_file(data_dir, mode)
|
||||
# TODO clean up all this to leverage built-in features of tokenizers
|
||||
self.features = token_classification_task.convert_examples_to_features(
|
||||
examples,
|
||||
labels,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
cls_token_at_end=bool(model_type in ["xlnet"]),
|
||||
# xlnet has a cls token at the end
|
||||
cls_token=tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=False,
|
||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(tokenizer.padding_side == "left"),
|
||||
pad_token=tokenizer.pad_token_id,
|
||||
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||
pad_token_label_id=self.pad_token_label_id,
|
||||
)
|
||||
logger.info(f"Saving features into cached file {cached_features_file}")
|
||||
torch.save(self.features, cached_features_file)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.features)
|
||||
|
||||
def __getitem__(self, i) -> InputFeatures:
|
||||
return self.features[i]
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
class TFTokenClassificationDataset:
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
pad_token_label_id: int = -100
|
||||
# Use cross entropy ignore_index as padding label id so that only
|
||||
# real label ids contribute to the loss later.
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
token_classification_task: TokenClassificationTask,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
labels: List[str],
|
||||
model_type: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
mode: Split = Split.train,
|
||||
):
|
||||
examples = token_classification_task.read_examples_from_file(data_dir, mode)
|
||||
# TODO clean up all this to leverage built-in features of tokenizers
|
||||
self.features = token_classification_task.convert_examples_to_features(
|
||||
examples,
|
||||
labels,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
cls_token_at_end=bool(model_type in ["xlnet"]),
|
||||
# xlnet has a cls token at the end
|
||||
cls_token=tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=False,
|
||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(tokenizer.padding_side == "left"),
|
||||
pad_token=tokenizer.pad_token_id,
|
||||
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||
pad_token_label_id=self.pad_token_label_id,
|
||||
)
|
||||
|
||||
def gen():
|
||||
for ex in self.features:
|
||||
if ex.token_type_ids is None:
|
||||
yield (
|
||||
{"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
|
||||
ex.label_ids,
|
||||
)
|
||||
else:
|
||||
yield (
|
||||
{
|
||||
"input_ids": ex.input_ids,
|
||||
"attention_mask": ex.attention_mask,
|
||||
"token_type_ids": ex.token_type_ids,
|
||||
},
|
||||
ex.label_ids,
|
||||
)
|
||||
|
||||
if "token_type_ids" not in tokenizer.model_input_names:
|
||||
self.dataset = tf.data.Dataset.from_generator(
|
||||
gen,
|
||||
({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
|
||||
(
|
||||
{"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
|
||||
tf.TensorShape([None]),
|
||||
),
|
||||
)
|
||||
else:
|
||||
self.dataset = tf.data.Dataset.from_generator(
|
||||
gen,
|
||||
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
|
||||
(
|
||||
{
|
||||
"input_ids": tf.TensorShape([None]),
|
||||
"attention_mask": tf.TensorShape([None]),
|
||||
"token_type_ids": tf.TensorShape([None]),
|
||||
},
|
||||
tf.TensorShape([None]),
|
||||
),
|
||||
)
|
||||
|
||||
def get_dataset(self):
|
||||
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
|
||||
|
||||
return self.dataset
|
||||
|
||||
def __len__(self):
|
||||
return len(self.features)
|
||||
|
||||
def __getitem__(self, i) -> InputFeatures:
|
||||
return self.features[i]
|
||||
Reference in New Issue
Block a user