From d6dde438eaf7117fa17de0c4ede3e5126989dfdf Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 26 Sep 2019 01:45:55 +0200 Subject: [PATCH] add batch dimension in encode --- examples/run_tf_glue.py | 47 ++-------------------- pytorch_transformers/tokenization_utils.py | 8 ++-- 2 files changed, 8 insertions(+), 47 deletions(-) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index e3cf1d4033..8d525842f5 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -20,51 +20,12 @@ loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) tf_model.compile(optimizer=optimizer, loss=loss, metrics=['sparse_categorical_accuracy']) # Train and evaluate using tf.keras.Model.fit() -tf_model.fit(train_dataset, epochs=1, steps_per_epoch=115, validation_data=valid_dataset, validation_steps=7) +tf_model.fit(train_dataset, epochs=3, steps_per_epoch=115, validation_data=valid_dataset, validation_steps=7) # Save the model and load it in PyTorch tf_model.save_pretrained('./runs/') -pt_model = BertForSequenceClassification.from_pretrained('./runs/') +pt_model = BertForSequenceClassification.from_pretrained('./runs/', from_tf=True) # Quickly inspect a few predictions -inputs = tokenizer.encode_plus("I said the company is doing great", "The company has good results", add_special_tokens=True) -pred = pt_model(torch.tensor([tokens])) - -# Divers -import torch - -import tensorflow as tf -import tensorflow_datasets -from pytorch_transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification, glue_convert_examples_to_features - -# Load tokenizer, model, dataset -tokenizer = BertTokenizer.from_pretrained('bert-base-cased') -model = TFBertForSequenceClassification.from_pretrained('bert-base-cased') - -pt_train_dataset = torch.load('../../data/glue_data//MRPC/cached_train_bert-base-cased_128_mrpc') - -def gen(): - for el in pt_train_dataset: - yield ((el.input_ids, el.attention_mask, el.token_type_ids), (el.label,)) - -dataset = tf.data.Dataset.from_generator(gen, - ((tf.int32, tf.int32, tf.int32), (tf.int64,)), - ((tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])), - (tf.TensorShape([]),))) - -dataset = dataset.shuffle(100).batch(32) -next(iter(dataset)) - -learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(2e-5, 345, 0) -loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) -model.compile(optimizer=tf.keras.optimizers.Adam( - learning_rate=learning_rate, - epsilon=1e-08, - clipnorm=1.0), - loss=loss, - metrics=[['sparse_categorical_accuracy']]) - -tensorboard_cbk = tf.keras.callbacks.TensorBoard(log_dir='./runs/', update_freq=10, histogram_freq=1) - -# Train model -model.fit(dataset, epochs=3, callbacks=[tensorboard_cbk]) +inputs = tokenizer.encode_plus("I said the company is doing great", "The company has good results", add_special_tokens=True, return_tensors='pt') +pred = pt_model(torch.tensor(tokens)) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 9a9b141412..ec5de3b772 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -849,11 +849,11 @@ class PreTrainedTokenizer(object): token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) if return_tensors == 'tf' and is_tf_available(): - sequence = tf.constant(sequence) - token_type_ids = tf.constant(token_type_ids) + sequence = tf.constant([sequence]) + token_type_ids = tf.constant([token_type_ids]) elif return_tensors == 'pt' and is_torch_available(): - sequence = torch.tensor(sequence) - token_type_ids = torch.tensor(token_type_ids) + sequence = torch.tensor([sequence]) + token_type_ids = torch.tensor([token_type_ids]) elif return_tensors is not None: logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))