From bab6ad01aad6f6a8cbf5b8634d890d8fce9f46d1 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 24 Oct 2019 21:41:45 +0000 Subject: [PATCH 1/2] run_tf_glue works with all tasks --- examples/run_tf_glue.py | 43 ++++++++++++++++++++++----- transformers/data/processors/glue.py | 4 +++ transformers/data/processors/utils.py | 7 +++++ 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index 399fe9e616..73173b0cf1 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -1,29 +1,47 @@ import os import tensorflow as tf import tensorflow_datasets -from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification +from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors # script parameters BATCH_SIZE = 32 EVAL_BATCH_SIZE = BATCH_SIZE * 2 USE_XLA = False USE_AMP = False +EPOCHS = 3 + +TASK = "mrpc" + +if TASK == "sst-2": + TFDS_TASK = "sst2" +elif TASK == "sts-b": + TFDS_TASK = "stsb" +else: + TFDS_TASK = TASK + +num_labels = len(glue_processors[TASK]().get_labels()) +print(num_labels) tf.config.optimizer.set_jit(USE_XLA) tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) -# Load tokenizer and model from pretrained model/vocabulary +# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) +config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained('bert-base-cased') -model = TFBertForSequenceClassification.from_pretrained('bert-base-cased') +model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config) # Load dataset via TensorFlow Datasets -data, info = tensorflow_datasets.load('glue/mrpc', with_info=True) +data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True) train_examples = info.splits['train'].num_examples + +# MNLI expects either validation_matched or validation_mismatched valid_examples = info.splits['validation'].num_examples # Prepare dataset for GLUE as a tf.data.Dataset instance -train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc') -valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc') +train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK) + +# MNLI expects either validation_matched or validation_mismatched +valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK) train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1) valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE) @@ -32,7 +50,13 @@ opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08) if USE_AMP: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') -loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + + +if num_labels == 1: + loss = tf.keras.losses.MeanSquaredError() +else: + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') model.compile(optimizer=opt, loss=loss, metrics=[metric]) @@ -40,7 +64,7 @@ model.compile(optimizer=opt, loss=loss, metrics=[metric]) train_steps = train_examples//BATCH_SIZE valid_steps = valid_examples//EVAL_BATCH_SIZE -history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, +history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps, validation_data=valid_dataset, validation_steps=valid_steps) # Save TF2 model @@ -57,6 +81,9 @@ sentence_2 = 'His findings were not compatible with this research.' inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') +del inputs_1["special_tokens_mask"] +del inputs_2["special_tokens_mask"] + pred_1 = pytorch_model(**inputs_1)[0].argmax().item() pred_2 = pytorch_model(**inputs_2)[0].argmax().item() print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0') diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index 741569ea30..c81582fb72 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -76,10 +76,14 @@ def glue_convert_examples_to_features(examples, tokenizer, features = [] for (ex_index, example) in enumerate(examples): + if ex_index == 10: + break + if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) if is_tf_dataset: example = processor.get_example_from_tensor_dict(example) + example = processor.tfds_map(example) inputs = tokenizer.encode_plus( example.text_a, diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 27138f9959..07bdf3150c 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -107,6 +107,13 @@ class DataProcessor(object): """Gets the list of labels for this data set.""" raise NotImplementedError() + def tfds_map(self, example): + """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. + This method converts examples to the correct format.""" + if len(self.get_labels()) > 1: + example.label = self.get_labels()[int(example.label)] + return example + @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" From beaf66b1f30aa29e11a02ecb5a7edb6b7b99eb01 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 24 Oct 2019 21:43:28 +0000 Subject: [PATCH 2/2] Remove break --- transformers/data/processors/glue.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index c81582fb72..518251b050 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -76,9 +76,6 @@ def glue_convert_examples_to_features(examples, tokenizer, features = [] for (ex_index, example) in enumerate(examples): - if ex_index == 10: - break - if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) if is_tf_dataset: