From 585217c87fbb619a6d57e0bb947575ce75025140 Mon Sep 17 00:00:00 2001
From: Julien Plu
Date: Tue, 22 Sep 2020 18:05:05 +0200
Subject: [PATCH] Add generic text classification example in TF (#5716)

* Add new example with nlp

* Update README

* replace nlp by datasets

* Update examples/text-classification/README.md

Add Lysandre's suggestion.

Co-authored-by: Lysandre Debut

Co-authored-by: Lysandre Debut
---
 examples/text-classification/README.md        |  25 ++
 .../run_tf_text_classification.py             | 298 ++++++++++++++++++
 2 files changed, 323 insertions(+)
 create mode 100644 examples/text-classification/run_tf_text_classification.py

diff --git a/examples/text-classification/README.md b/examples/text-classification/README.md
index 30c44c1d41..ce412751c1 100644
--- a/examples/text-classification/README.md
+++ b/examples/text-classification/README.md
@@ -23,6 +23,31 @@ Quick benchmarks from the script (no other modifications):
 
 Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
 
+## Run generic text classification script in TensorFlow
+
+The script [run_tf_text_classification.py](https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_tf_text_classification.py) allows users to run a text classification on their own CSV files. For now there are a few restrictions: the CSV files must have a header corresponding to the column names and no more than three columns (one column for the label, one column for the text and, optionally, another column for a second piece of text, e.g. for entailment classification).
+
+To use the script, one has to run the following command line:
+```bash
+python run_tf_text_classification.py \
+  --train_file train.csv \ ### training dataset file location (mandatory if running with --do_train option)
+  --dev_file dev.csv \ ### development dataset file location (mandatory if running with --do_eval option)
+  --test_file test.csv \ ### test dataset file location (mandatory if running with --do_predict option)
+  --label_column_id 0 \ ### which column corresponds to the labels
+  --model_name_or_path bert-base-multilingual-uncased \
+  --output_dir model \
+  --num_train_epochs 4 \
+  --per_device_train_batch_size 16 \
+  --per_device_eval_batch_size 32 \
+  --do_train \
+  --do_eval \
+  --do_predict \
+  --logging_steps 10 \
+  --evaluate_during_training \
+  --save_steps 10 \
+  --overwrite_output_dir \
+  --max_seq_length 128
+```
 
 # Run PyTorch version
 
diff --git a/examples/text-classification/run_tf_text_classification.py b/examples/text-classification/run_tf_text_classification.py
new file mode 100644
index 0000000000..40472da47e
--- /dev/null
+++ b/examples/text-classification/run_tf_text_classification.py
@@ -0,0 +1,298 @@
+# coding=utf-8
+""" Fine-tuning the library models for sequence classification."""
+
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import datasets
+import numpy as np
+import tensorflow as tf
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    EvalPrediction,
+    HfArgumentParser,
+    PreTrainedTokenizer,
+    TFAutoModelForSequenceClassification,
+    TFTrainer,
+    TFTrainingArguments,
+)
+
+
+def get_tfds(
+    train_file: str,
+    eval_file: str,
+    test_file: str,
+    tokenizer: PreTrainedTokenizer,
+    label_column_id: int,
+    max_seq_length: Optional[int] = None,
+):
+    """
+    Load the given CSV files and turn each provided split into a `tf.data.Dataset`.
+
+    The column at position `label_column_id` holds the labels. The one or two remaining columns are
+    tokenized as a single text or as a pair of texts.
+
+    Args:
+        train_file: path of the training CSV file, or `None` to skip this split.
+        eval_file: path of the development CSV file, or `None` to skip this split.
+        test_file: path of the test CSV file, or `None` to skip this split.
+        tokenizer: tokenizer used to encode the text column(s).
+        label_column_id: position of the label column among the CSV columns.
+        max_seq_length: `max_length` given to the tokenizer, which truncates and pads with `padding="max_length"`.
+
+    Returns:
+        A tuple `(train_ds, val_ds, test_ds, label2id)`. A dataset is `None` when its file was not given;
+        `label2id` maps each label found in the first provided split to an integer id.
+
+    Raises:
+        ValueError: if no file is given, or if there are not one or two columns besides the label column.
+    """
+    files = {}
+
+    if train_file is not None:
+        files[datasets.Split.TRAIN] = [train_file]
+    if eval_file is not None:
+        files[datasets.Split.VALIDATION] = [eval_file]
+    if test_file is not None:
+        files[datasets.Split.TEST] = [test_file]
+
+    if not files:
+        raise ValueError("At least one of train_file, eval_file or test_file must be provided.")
+
+    ds = datasets.load_dataset("csv", data_files=files)
+    first_split = next(iter(files))
+    features_name = list(ds[first_split].features.keys())
+    label_name = features_name.pop(label_column_id)
+    # Sort the labels so that the label -> id mapping is the same from one run to the next
+    # (the iteration order of a set of strings is not).
+    label_list = sorted(set(ds[first_split][label_name]), key=str)
+    label2id = {label: i for i, label in enumerate(label_list)}
+    input_names = ["input_ids"] + tokenizer.model_input_names
+    transformed_ds = {}
+
+    if len(features_name) == 1:
+        for k in files.keys():
+            transformed_ds[k] = ds[k].map(
+                lambda example: tokenizer.batch_encode_plus(
+                    example[features_name[0]], truncation=True, max_length=max_seq_length, padding="max_length"
+                ),
+                batched=True,
+            )
+    elif len(features_name) == 2:
+        for k in files.keys():
+            transformed_ds[k] = ds[k].map(
+                lambda example: tokenizer.batch_encode_plus(
+                    # Pair up the two text columns row by row: one (text, text_pair) tuple per example.
+                    list(zip(example[features_name[0]], example[features_name[1]])),
+                    truncation=True,
+                    max_length=max_seq_length,
+                    padding="max_length",
+                ),
+                batched=True,
+            )
+    else:
+        raise ValueError(
+            f"Expected one or two text columns besides the label column, got {len(features_name)}: {features_name}"
+        )
+
+    def make_tf_dataset(split):
+        """Wrap the tokenized `split` into a `tf.data.Dataset`, or return `None` if the split is absent."""
+        if split not in transformed_ds:
+            return None
+
+        def gen():
+            for ex in transformed_ds[split]:
+                d = {k: v for k, v in ex.items() if k in input_names}
+                label = label2id[ex[label_name]]
+                yield (d, label)
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            ({k: tf.int32 for k in input_names}, tf.int64),
+            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
+        )
+
+    train_ds = make_tf_dataset(datasets.Split.TRAIN)
+    val_ds = make_tf_dataset(datasets.Split.VALIDATION)
+    test_ds = make_tf_dataset(datasets.Split.TEST)
+
+    return train_ds, val_ds, test_ds, label2id
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    label_column_id: int = field(metadata={"help": "Which column contains the label"})
+    train_file: Optional[str] = field(default=None, metadata={"help": "The path of the training file"})
+    dev_file: Optional[str] = field(default=None, metadata={"help": "The path of the development file"})
+    test_file: Optional[str] = field(default=None, metadata={"help": "The path of the test file"})
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
+    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
+    # or just modify its tokenizer_config.json.
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+def main():
+    """
+    Parse the command line, build the datasets and the model, then train and/or evaluate.
+
+    Returns:
+        A dict with the evaluation metrics, empty when `training_args.do_eval` is false.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(
+        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
+        training_args.n_replicas,
+        bool(training_args.n_replicas > 1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        use_fast=model_args.use_fast,
+        cache_dir=model_args.cache_dir,
+    )
+
+    train_dataset, eval_dataset, test_ds, label2id = get_tfds(
+        train_file=data_args.train_file,
+        eval_file=data_args.dev_file,
+        test_file=data_args.test_file,
+        tokenizer=tokenizer,
+        label_column_id=data_args.label_column_id,
+        max_seq_length=data_args.max_seq_length,
+    )
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=len(label2id),
+        label2id=label2id,
+        id2label={i: label for label, i in label2id.items()},
+        finetuning_task="text-classification",
+        cache_dir=model_args.cache_dir,
+    )
+
+    with training_args.strategy.scope():
+        model = TFAutoModelForSequenceClassification.from_pretrained(
+            model_args.model_name_or_path,
+            from_pt=bool(".bin" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+
+    def compute_metrics(p: EvalPrediction) -> Dict:
+        """Return the accuracy of the arg-max predictions against the reference label ids."""
+        preds = np.argmax(p.predictions, axis=1)
+
+        return {"acc": (preds == p.label_ids).mean()}
+
+    # Initialize our Trainer
+    trainer = TFTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+    )
+
+    # Training
+    if training_args.do_train:
+        trainer.train()
+        trainer.save_model()
+        tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        result = trainer.evaluate()
+        # The output directory may not exist yet when evaluating without training first.
+        os.makedirs(training_args.output_dir, exist_ok=True)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+
+            for key, value in result.items():
+                logger.info(" %s = %s", key, value)
+                writer.write("%s = %s\n" % (key, value))
+
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()