diff --git a/examples/ner/run_tf_ner.py b/examples/ner/run_tf_ner.py index 0a607ff662..88b235d99e 100644 --- a/examples/ner/run_tf_ner.py +++ b/examples/ner/run_tf_ner.py @@ -157,7 +157,9 @@ def train( writer = tf.summary.create_file_writer("/tmp/mylogs") with strategy.scope(): - loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) + loss_fct = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + ) optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"]) if args["fp16"]: @@ -205,11 +207,9 @@ def train( with tf.GradientTape() as tape: logits = model(train_features["input_ids"], **inputs)[0] - logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(train_features["input_mask"], (-1,)) - active_logits = tf.boolean_mask(logits, active_loss) - train_labels = tf.reshape(train_labels, (-1,)) - active_labels = tf.boolean_mask(train_labels, active_loss) + active_loss = tf.reshape(train_labels, (-1,)) != pad_token_label_id + active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss) + active_labels = tf.boolean_mask(tf.reshape(train_labels, (-1,)), active_loss) cross_entropy = loss_fct(active_labels, active_logits) loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size) grads = tape.gradient(loss, model.trainable_variables) @@ -329,11 +329,9 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode) with strategy.scope(): logits = model(eval_features["input_ids"], **inputs)[0] - tmp_logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(eval_features["input_mask"], (-1,)) - active_logits = tf.boolean_mask(tmp_logits, active_loss) - tmp_eval_labels = tf.reshape(eval_labels, (-1,)) - active_labels = tf.boolean_mask(tmp_eval_labels, active_loss) + active_loss = tf.reshape(eval_labels, (-1,)) != pad_token_label_id + active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss) + active_labels = tf.boolean_mask(tf.reshape(eval_labels, (-1,)), active_loss) cross_entropy = loss_fct(active_labels, active_logits) loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size) @@ -497,8 +495,8 @@ def main(_): ) labels = get_labels(args["labels"]) - num_labels = len(labels) + 1 - pad_token_label_id = 0 + num_labels = len(labels) + pad_token_label_id = -1 config = AutoConfig.from_pretrained( args["config_name"] if args["config_name"] else args["model_name_or_path"], num_labels=num_labels, @@ -522,7 +520,6 @@ def main(_): config=config, cache_dir=args["cache_dir"] if args["cache_dir"] else None, ) - model.layers[-1].activation = tf.keras.activations.softmax train_batch_size = args["per_device_train_batch_size"] * args["n_device"] train_dataset, num_train_examples = load_and_cache_examples( diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index d232370905..db87eda0be 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -214,7 +214,7 @@ class GradientAccumulator(object): raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients): - if accum_gradient is not None: + if accum_gradient is not None and gradient is not None: accum_gradient.assign_add(gradient) self._accum_steps.assign_add(1) @@ -241,6 +241,7 @@ class GradientAccumulator(object): return ( gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients + if gradient is not None ) else: return self._gradients