From e4512aab3bffcbaa28d65e4eca7ab73d8fdd7889 Mon Sep 17 00:00:00 2001 From: Viktor Alm Date: Tue, 12 May 2020 14:48:48 +0200 Subject: [PATCH] Add MultipleChoice to TFTrainer [WIP] (#4270) * catch gpu len 1 set to gpu0 * Add mpc to trainer * Add MPC for TF * fix TF automodel for MPC and add Albert * Apply style * Fix import * Note to self: double check * Make shape None, None for datasetgenerator output shapes * Add from_pt bool which doesnt seem to work * Original checkpoint dir * Fix docstrings for automodel * Update readme and apply style * Colab should probably not be from users * Colabs should probably not be from users * Add colab * Update README.md * Update README.md * Cleanup __intit__ * Cleanup flake8 trailing comma * Update src/transformers/training_args_tf.py * Update src/transformers/modeling_tf_auto.py Co-authored-by: Viktor Alm Co-authored-by: Julien Chaumond --- examples/README.md | 4 +- examples/multiple-choice/README.md | 25 ++ .../multiple-choice/run_tf_multiple_choice.py | 211 +++++++++++++++ .../multiple-choice/utils_multiple_choice.py | 256 ++++++++++++++---- src/transformers/__init__.py | 2 + src/transformers/modeling_tf_albert.py | 126 ++++++++- src/transformers/modeling_tf_auto.py | 153 +++++++++++ src/transformers/trainer_tf.py | 7 +- src/transformers/training_args_tf.py | 11 +- 9 files changed, 730 insertions(+), 65 deletions(-) create mode 100644 examples/multiple-choice/run_tf_multiple_choice.py diff --git a/examples/README.md b/examples/README.md index 0645c1e1b6..e27e34f46f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -14,12 +14,12 @@ This is still a work-in-progress – in particular documentation is still sparse ## Tasks built on Trainer -| Task | Example datasets | Trainer support | TFTrainer support | pytorch-lightning | Colab | One-click Deploy to Azure (wip) | +| Task | Example datasets | Trainer support | TFTrainer support | pytorch-lightning | Colab | One-click Deploy to Azure (wip) | |---|---|:---:|:---:|:---:|:---:|:---:| | [`language-modeling`](./language-modeling) | Raw text | ✅ | - | - | - | - | | [`text-classification`](./text-classification) | GLUE, XNLI | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/trainer/01_text_classification.ipynb) | [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure%2Fazure-quickstart-templates%2Fmaster%2F101-storage-account-create%2Fazuredeploy.json) | | [`token-classification`](./token-classification) | CoNLL NER | ✅ | ✅ | ✅ | - | - | -| [`multiple-choice`](./multiple-choice) | SWAG, RACE, ARC | ✅ | - | - | - | - | +| [`multiple-choice`](./multiple-choice) | SWAG, RACE, ARC | ✅ | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb) | - | diff --git a/examples/multiple-choice/README.md b/examples/multiple-choice/README.md index 469724be4b..89796b656f 100644 --- a/examples/multiple-choice/README.md +++ b/examples/multiple-choice/README.md @@ -29,3 +29,28 @@ Training with the defined hyper-parameters yields the following results: eval_acc = 0.8338998300509847 eval_loss = 0.44457291918821606 ``` + + +## Tensorflow + +```bash +export SWAG_DIR=/path/to/swag_data_dir +python ./examples/multiple-choice/run_tf_multiple_choice.py \ +--task_name swag \ +--model_name_or_path bert-base-cased \ +--do_train \ +--do_eval \ +--data_dir $SWAG_DIR \ +--learning_rate 5e-5 \ +--num_train_epochs 3 \ +--max_seq_length 80 \ +--output_dir models_bert/swag_base \ +--per_gpu_eval_batch_size=16 \ +--per_gpu_train_batch_size=16 \ +--logging-dir logs \ +--gradient_accumulation_steps 2 \ +--overwrite_output +``` + +# Run it in colab +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb) diff --git a/examples/multiple-choice/run_tf_multiple_choice.py b/examples/multiple-choice/run_tf_multiple_choice.py new file mode 100644 index 0000000000..26d0fcbff5 --- /dev/null +++ b/examples/multiple-choice/run_tf_multiple_choice.py @@ -0,0 +1,211 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet).""" + + +import logging +import os +from dataclasses import dataclass, field +from typing import Dict, Optional + +import numpy as np + +from transformers import ( + AutoConfig, + AutoTokenizer, + EvalPrediction, + HfArgumentParser, + TFAutoModelForMultipleChoice, + TFTrainer, + TFTrainingArguments, + set_seed, +) +from utils_multiple_choice import Split, TFMultipleChoiceDataset, processors + + +logger = logging.getLogger(__name__) + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())}) + data_dir: str = field(metadata={"help": "Should contain the data files for the task."}) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if ( + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.warning( + "device: %s, n_gpu: %s, 16-bits training: %s", training_args.device, training_args.n_gpu, training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed + set_seed(training_args.seed) + + try: + processor = processors[data_args.task_name]() + label_list = processor.get_labels() + num_labels = len(label_list) + except KeyError: + raise ValueError("Task not found: %s" % (data_args.task_name)) + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + ) + with training_args.strategy.scope(): + model = TFAutoModelForMultipleChoice.from_pretrained( + model_args.model_name_or_path, + from_pt=bool(".bin" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + ) + # Get datasets + train_dataset = ( + TFMultipleChoiceDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + task=data_args.task_name, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.train, + ) + if training_args.do_train + else None + ) + eval_dataset = ( + TFMultipleChoiceDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + task=data_args.task_name, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.dev, + ) + if training_args.do_eval + else None + ) + + def compute_metrics(p: EvalPrediction) -> Dict: + preds = np.argmax(p.predictions, axis=1) + return {"acc": simple_accuracy(preds, p.label_ids)} + + # Initialize our Trainer + trainer = TFTrainer( + model=model, + args=training_args, + train_dataset=train_dataset.get_dataset() if train_dataset else None, + eval_dataset=eval_dataset.get_dataset() if eval_dataset else None, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + trainer.train() + trainer.save_model() + tokenizer.save_pretrained(training_args.output_dir) + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + + result = trainer.evaluate() + + output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in result.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) + + results.update(result) + + return results + + +if __name__ == "__main__": + main() diff --git a/examples/multiple-choice/utils_multiple_choice.py b/examples/multiple-choice/utils_multiple_choice.py index 8fe201e8d0..e78173d35e 100644 --- a/examples/multiple-choice/utils_multiple_choice.py +++ b/examples/multiple-choice/utils_multiple_choice.py @@ -25,11 +25,9 @@ from dataclasses import dataclass from enum import Enum from typing import List, Optional -import torch import tqdm -from torch.utils.data.dataset import Dataset -from transformers import PreTrainedTokenizer, torch_distributed_zero_first +from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available logger = logging.getLogger(__name__) @@ -76,66 +74,160 @@ class Split(Enum): test = "test" -class MultipleChoiceDataset(Dataset): - """ - This will be superseded by a framework-agnostic approach - soon. - """ +if is_torch_available(): + import torch + from torch.utils.data.dataset import Dataset + from transformers import torch_distributed_zero_first - features: List[InputFeatures] + class MultipleChoiceDataset(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ - def __init__( - self, - data_dir: str, - tokenizer: PreTrainedTokenizer, - task: str, - max_seq_length: Optional[int] = None, - overwrite_cache=False, - mode: Split = Split.train, - local_rank=-1, - ): - processor = processors[task]() + features: List[InputFeatures] - cached_features_file = os.path.join( - data_dir, - "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,), - ) - with torch_distributed_zero_first(local_rank): - # Make sure only the first process in distributed training processes the dataset, - # and the others will use the cache. + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + task: str, + max_seq_length: Optional[int] = None, + overwrite_cache=False, + mode: Split = Split.train, + local_rank=-1, + ): + processor = processors[task]() - if os.path.exists(cached_features_file) and not overwrite_cache: - logger.info(f"Loading features from cached file {cached_features_file}") - self.features = torch.load(cached_features_file) - else: - logger.info(f"Creating features from dataset file at {data_dir}") - label_list = processor.get_labels() - if mode == Split.dev: - examples = processor.get_dev_examples(data_dir) - elif mode == Split.test: - examples = processor.get_test_examples(data_dir) + cached_features_file = os.path.join( + data_dir, + "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,), + ) + with torch_distributed_zero_first(local_rank): + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. + + if os.path.exists(cached_features_file) and not overwrite_cache: + logger.info(f"Loading features from cached file {cached_features_file}") + self.features = torch.load(cached_features_file) else: - examples = processor.get_train_examples(data_dir) - logger.info("Training examples: %s", len(examples)) - # TODO clean up all this to leverage built-in features of tokenizers - self.features = convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - pad_on_left=bool(tokenizer.padding_side == "left"), - pad_token=tokenizer.pad_token_id, - pad_token_segment_id=tokenizer.pad_token_type_id, - ) - if local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(self.features, cached_features_file) + logger.info(f"Creating features from dataset file at {data_dir}") + label_list = processor.get_labels() + if mode == Split.dev: + examples = processor.get_dev_examples(data_dir) + elif mode == Split.test: + examples = processor.get_test_examples(data_dir) + else: + examples = processor.get_train_examples(data_dir) + logger.info("Training examples: %s", len(examples)) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + ) + if local_rank in [-1, 0]: + logger.info("Saving features into cached file %s", cached_features_file) + torch.save(self.features, cached_features_file) - def __len__(self): - return len(self.features) + def __len__(self): + return len(self.features) - def __getitem__(self, i) -> InputFeatures: - return self.features[i] + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +if is_tf_available(): + import tensorflow as tf + + class TFMultipleChoiceDataset: + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + features: List[InputFeatures] + + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + task: str, + max_seq_length: Optional[int] = 128, + overwrite_cache=False, + mode: Split = Split.train, + ): + processor = processors[task]() + + logger.info(f"Creating features from dataset file at {data_dir}") + label_list = processor.get_labels() + if mode == Split.dev: + examples = processor.get_dev_examples(data_dir) + elif mode == Split.test: + examples = processor.get_test_examples(data_dir) + else: + examples = processor.get_train_examples(data_dir) + logger.info("Training examples: %s", len(examples)) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + ) + + def gen(): + for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + yield ( + { + "example_id": 0, + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + ex.label, + ) + + self.dataset = tf.data.Dataset.from_generator( + gen, + ( + { + "example_id": tf.int32, + "input_ids": tf.int32, + "attention_mask": tf.int32, + "token_type_ids": tf.int32, + }, + tf.int64, + ), + ( + { + "example_id": tf.TensorShape([]), + "input_ids": tf.TensorShape([None, None]), + "attention_mask": tf.TensorShape([None, None]), + "token_type_ids": tf.TensorShape([None, None]), + }, + tf.TensorShape([]), + ), + ) + + def get_dataset(self): + return self.dataset + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] class DataProcessor: @@ -225,6 +317,52 @@ class RaceProcessor(DataProcessor): return examples +class SynonymProcessor(DataProcessor): + """Processor for the Synonym data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {} train".format(data_dir)) + return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {} dev".format(data_dir)) + return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {} dev".format(data_dir)) + + return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1", "2", "3", "4"] + + def _read_csv(self, input_file): + with open(input_file, "r", encoding="utf-8") as f: + return list(csv.reader(f)) + + def _create_examples(self, lines: List[List[str]], type: str): + """Creates examples for the training and dev sets.""" + + examples = [ + InputExample( + example_id=line[0], + question="", # in the swag dataset, the + # common beginning of each + # choice is stored in "sent2". + contexts=[line[1], line[1], line[1], line[1], line[1]], + endings=[line[2], line[3], line[4], line[5], line[6]], + label=line[7], + ) + for line in lines # we skip the line with the column names + ] + + return examples + + class SwagProcessor(DataProcessor): """Processor for the SWAG data set.""" @@ -435,7 +573,5 @@ def convert_examples_to_features( return features -processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor} - - -MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4} +processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor} +MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5} diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 242d03563b..43c91e652a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -359,6 +359,7 @@ if is_tf_available(): from .modeling_tf_auto import ( TFAutoModel, TFAutoModelForPreTraining, + TFAutoModelForMultipleChoice, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelWithLMHead, @@ -493,6 +494,7 @@ if is_tf_available(): TFAlbertModel, TFAlbertForPreTraining, TFAlbertForMaskedLM, + TFAlbertForMultipleChoice, TFAlbertForSequenceClassification, TFAlbertForQuestionAnswering, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 5982446250..a6065e70e1 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -21,7 +21,7 @@ import logging import tensorflow as tf from .configuration_albert import AlbertConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable +from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding @@ -957,3 +957,127 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Albert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.albert = TFAlbertMainLayer(config, name="albert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import AlbertTokenizer, TFAlbertForMultipleChoice + + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') + model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2') + + example1 = ["This is a context", "Is it a context? Yes"] + example2 = ["This is a context", "Is it a context? No"] + encoding = tokenizer.batch_encode_plus([example1, example2], return_tensors='tf', truncation_strategy="only_first", pad_to_max_length=True, max_length=128) + outputs = model(encoding["input_ids"][None, :]) + logits = outputs[0] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + print("isdict(1)") + input_ids = inputs.get("input_ids") + print(input_ids) + + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." + else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] + + outputs = self.albert(flat_inputs, training=training) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # reshaped_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py index c9d084346d..d65b0f80e5 100644 --- a/src/transformers/modeling_tf_auto.py +++ b/src/transformers/modeling_tf_auto.py @@ -36,6 +36,7 @@ from .configuration_utils import PretrainedConfig from .modeling_tf_albert import ( TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TFAlbertForMaskedLM, + TFAlbertForMultipleChoice, TFAlbertForPreTraining, TFAlbertForQuestionAnswering, TFAlbertForSequenceClassification, @@ -44,6 +45,7 @@ from .modeling_tf_albert import ( from .modeling_tf_bert import ( TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, TFBertForMaskedLM, + TFBertForMultipleChoice, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, @@ -172,6 +174,10 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( ] ) +TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( + [(BertConfig, TFBertForMultipleChoice), (AlbertConfig, TFAlbertForMultipleChoice)] +) + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForQuestionAnswering), @@ -662,6 +668,153 @@ class TFAutoModelWithLMHead(object): ) +class TFAutoModelForMultipleChoice: + r""" + :class:`~transformers.TFAutoModelForMultipleChoice` is a generic model class + that will be instantiated as one of the multiple choice model classes of the library + when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `albert`: TFAlbertForMultipleChoice (Albert model) + - contains `bert`: TFBertForMultipleChoice (Bert model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForMultipleChoice is designed to be instantiated " + "using the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForMultipleChoice.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `albert` configuration class: AlbertModel (Albert model) + - isInstance of `bert` configuration class: BertModel (Bert model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the multiple choice model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `albert`: TFRobertaForMultiple (Albert model) + - contains `bert`: TFBertForMultipleChoice (Bert model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. + + from_pt: (`Optional`) Boolean + Set to True if the Checkpoint is a PyTorch checkpoint. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = TFAutoModelFormultipleChoice.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = TFAutoModelFormultipleChoice.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelFormultipleChoice.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModelFormultipleChoice.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), + ) + ) + + class TFAutoModelForSequenceClassification(object): r""" :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 3163eec0e1..1a5585c1db 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -125,7 +125,9 @@ class TFTrainer: in the Tensorflow documentation and those contained in the transformers library. """ if self.args.optimizer_name == "adamw": - self.optimizer = create_optimizer(self.args.learning_rate, self.train_steps, self.args.warmup_steps) + self.optimizer = create_optimizer( + self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr + ) else: try: self.optimizer = tf.keras.optimizers.get( @@ -139,6 +141,7 @@ class TFTrainer: self.optimizer = tf.keras.optimizers.get( {"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}} ) + logger.info("Created an/a {} optimizer".format(self.optimizer)) def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None: """ @@ -149,6 +152,7 @@ class TFTrainer: load_model: if we want to start the training from the latest checkpoint. """ ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model) + self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep) if load_model: @@ -425,5 +429,6 @@ class TFTrainer: path = os.path.join(self.args.output_dir, "saved_model") + logger.info("Saving model in {}".format(path)) os.makedirs(path, exist_ok=True) self.model.save_pretrained(self.args.output_dir) diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index bb9bf9d18a..e295c4aae1 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -30,6 +30,12 @@ class TFTrainingArguments(TrainingArguments): "help": "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses" }, ) + tpu_name: str = field( + default=None, metadata={"help": "Name of TPU"}, + ) + end_lr: float = field( + default=0, metadata={"help": "End learning rate for optimizer"}, + ) eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."}) debug: bool = field( default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"} @@ -45,7 +51,10 @@ class TFTrainingArguments(TrainingArguments): strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: try: - tpu = tf.distribute.cluster_resolver.TPUClusterResolver() + if self.tpu_name: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name) + else: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver() except ValueError: tpu = None