Update TF LM examples (#15855)

This commit is contained in:
Joao Gante
2022-03-01 14:12:58 +00:00
committed by GitHub
parent 54f0db4066
commit 3f2e636850
2 changed files with 46 additions and 132 deletions

View File

@@ -29,13 +29,11 @@ import os
import random import random
import sys import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from functools import partial
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import datasets import datasets
import numpy as np
import tensorflow as tf import tensorflow as tf
from datasets import load_dataset from datasets import load_dataset
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
@@ -48,6 +46,7 @@ from transformers import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
AutoConfig, AutoConfig,
AutoTokenizer, AutoTokenizer,
DefaultDataCollator,
HfArgumentParser, HfArgumentParser,
TFAutoModelForCausalLM, TFAutoModelForCausalLM,
TFTrainingArguments, TFTrainingArguments,
@@ -160,9 +159,6 @@ class DataTrainingArguments:
default=None, default=None,
metadata={"help": "The number of processes to use for the preprocessing."}, metadata={"help": "The number of processes to use for the preprocessing."},
) )
mlm_probability: float = field(
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
)
line_by_line: bool = field( line_by_line: bool = field(
default=False, default=False,
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
@@ -212,20 +208,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
self.model.save_pretrained(self.output_dir) self.model.save_pretrained(self.output_dir)
# endregion
# region Data generator
def sample_generator(dataset, tokenizer):
    """Yield ``(features, labels)`` pairs from ``dataset`` in a random order.

    A fresh random permutation of the dataset indices is drawn once per call.
    Every field of each selected example is converted to a tensor (with an
    ``int64`` dtype hint) and the resulting dict is yielded together with its
    own ``"labels"`` entry.

    Args:
        dataset: Indexable collection of dict-like examples supporting ``len``.
            Each example must contain a ``"labels"`` key.
        tokenizer: Not used by this generator; kept in the signature for callers.

    Yields:
        Tuple ``(features, features["labels"])``.
    """
    shuffled_indices = np.random.permutation(len(dataset))
    for position in shuffled_indices:
        # The permutation holds NumPy integers; index with a plain Python int.
        raw_example = dataset[int(position)]
        features = {}
        for name, values in raw_example.items():
            features[name] = tf.convert_to_tensor(values, dtype_hint=tf.int64)
        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
# endregion # endregion
@@ -457,34 +439,27 @@ def main():
# region TF Dataset preparation # region TF Dataset preparation
num_replicas = training_args.strategy.num_replicas_in_sync num_replicas = training_args.strategy.num_replicas_in_sync
train_generator = partial(sample_generator, train_dataset, tokenizer) data_collator = DefaultDataCollator(return_tensors="tf")
train_signature = {
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
for feature in train_dataset.features
if feature != "special_tokens_mask"
}
train_sig = (train_signature, train_signature["labels"])
options = tf.data.Options() options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_train_dataset = (
tf.data.Dataset.from_generator(train_generator, output_signature=train_sig) tf_train_dataset = train_dataset.to_tf_dataset(
.with_options(options) # labels are passed as input, as we will use the model's internal loss
.batch(batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True) columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
.repeat(int(training_args.num_train_epochs)) shuffle=True,
) batch_size=num_replicas * training_args.per_device_train_batch_size,
eval_generator = partial(sample_generator, eval_dataset, tokenizer) collate_fn=data_collator,
eval_signature = { drop_remainder=True,
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64) ).with_options(options)
for feature in eval_dataset.features
if feature != "special_tokens_mask" tf_eval_dataset = eval_dataset.to_tf_dataset(
} # labels are passed as input, as we will use the model's internal loss
eval_sig = (eval_signature, eval_signature["labels"]) columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
tf_eval_dataset = ( shuffle=False,
tf.data.Dataset.from_generator(eval_generator, output_signature=eval_sig) batch_size=num_replicas * training_args.per_device_train_batch_size,
.with_options(options) collate_fn=data_collator,
.batch(batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True) drop_remainder=True,
.repeat(int(training_args.num_train_epochs)) ).with_options(options)
)
# endregion # endregion
# region Optimizer and loss # region Optimizer and loss
@@ -500,10 +475,8 @@ def main():
weight_decay_rate=training_args.weight_decay, weight_decay_rate=training_args.weight_decay,
) )
def dummy_loss(y_true, y_pred): # no user-specified loss = will use the model internal loss
return tf.reduce_mean(y_pred) model.compile(optimizer=optimizer)
model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
# endregion # endregion
# region Training and validation # region Training and validation

View File

@@ -31,13 +31,11 @@ import os
import random import random
import sys import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from functools import partial
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import datasets import datasets
import numpy as np
import tensorflow as tf import tensorflow as tf
from datasets import load_dataset from datasets import load_dataset
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
@@ -50,6 +48,7 @@ from transformers import (
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
AutoConfig, AutoConfig,
AutoTokenizer, AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser, HfArgumentParser,
TFAutoModelForMaskedLM, TFAutoModelForMaskedLM,
TFTrainingArguments, TFTrainingArguments,
@@ -217,56 +216,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
self.model.save_pretrained(self.output_dir) self.model.save_pretrained(self.output_dir)
# endregion
# region Data generator
def sample_generator(dataset, tokenizer, mlm_probability=0.15, pad_to_multiple_of=None):
    """Yield randomly ordered, freshly masked MLM examples as ``(features, labels)``.

    For every example (visited in a random permutation of the dataset indices)
    the example is padded through ``tokenizer.pad``, its ``special_tokens_mask``
    is removed, ``mask_tokens`` produces the corrupted ``input_ids`` and the
    matching ``labels``, pad-token labels are set to ``-100``, and every field
    is converted to a tensor.

    Args:
        dataset: Indexable collection of examples supporting ``len``.
        tokenizer: Tokenizer providing ``mask_token``, ``pad`` and ``pad_token_id``.
        mlm_probability: Masking probability forwarded to ``mask_tokens``.
        pad_to_multiple_of: Forwarded unchanged to ``tokenizer.pad``.

    Yields:
        Tuple ``(features, features["labels"])``.

    Raises:
        ValueError: If the tokenizer has no mask token (raised on first iteration).
    """
    if tokenizer.mask_token is None:
        raise ValueError("This tokenizer does not have a mask token which is necessary for masked language modeling. ")

    for position in np.random.permutation(len(dataset)):
        # Pad the single example and get NumPy arrays back.
        padded = tokenizer.pad(dataset[int(position)], return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
        special_mask = padded.pop("special_tokens_mask", None)

        masked_ids, label_ids = mask_tokens(
            padded["input_ids"], mlm_probability, tokenizer, special_tokens_mask=special_mask
        )
        padded["input_ids"] = masked_ids
        padded["labels"] = label_ids

        if tokenizer.pad_token_id is not None:
            # Labels equal to the pad token id are replaced with -100.
            stored_labels = padded["labels"]
            stored_labels[stored_labels == tokenizer.pad_token_id] = -100

        features = {name: tf.convert_to_tensor(values) for name, values in padded.items()}
        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
def mask_tokens(inputs, mlm_probability, tokenizer, special_tokens_mask):
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

    Args:
        inputs: NumPy integer array of token ids. It is modified in place and
            also returned.
        mlm_probability: Probability with which each non-special token is
            selected for prediction.
        tokenizer: Object providing ``mask_token``, ``convert_tokens_to_ids``
            and ``__len__`` (used as the exclusive upper bound for random
            replacement ids). ``get_special_tokens_mask`` is only required
            when ``special_tokens_mask`` is ``None``.
        special_tokens_mask: Array-like with the same shape as ``inputs`` whose
            truthy entries mark positions that must never be selected, or
            ``None`` to derive the mask from the tokenizer.

    Returns:
        Tuple ``(inputs, labels)`` where ``labels`` holds the original token id
        at selected positions and ``-100`` everywhere else.
    """
    labels = np.copy(inputs)

    if special_tokens_mask is None:
        # Callers obtain the mask with ``example.pop("special_tokens_mask", None)``,
        # so it can legitimately be missing; ask the tokenizer instead of crashing.
        rows = np.atleast_2d(labels).tolist()
        special_tokens_mask = np.array(
            [tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True) for row in rows]
        ).reshape(labels.shape)
    # ``asarray`` lets plain lists through as well as arrays.
    special_tokens_mask = np.asarray(special_tokens_mask).astype(np.bool_)

    # We sample a few tokens in each sequence for MLM training (with probability `mlm_probability`)
    probability_matrix = np.random.random_sample(labels.shape)
    # A zero draw can never exceed the threshold, so special tokens are never selected.
    probability_matrix[special_tokens_mask] = 0.0
    masked_indices = probability_matrix > (1 - mlm_probability)
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = (np.random.random_sample(labels.shape) < 0.8) & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    # (half of the remaining 20% of selected positions).
    indices_random = (np.random.random_sample(labels.shape) < 0.5) & masked_indices & ~indices_replaced
    random_words = np.random.randint(low=0, high=len(tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64)
    inputs[indices_random] = random_words

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
# endregion # endregion
@@ -531,35 +480,29 @@ def main():
# region TF Dataset preparation # region TF Dataset preparation
num_replicas = training_args.strategy.num_replicas_in_sync num_replicas = training_args.strategy.num_replicas_in_sync
train_generator = partial(sample_generator, train_dataset, tokenizer) data_collator = DataCollatorForLanguageModeling(
train_signature = { tokenizer=tokenizer, mlm_probability=data_args.mlm_probability, return_tensors="tf"
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64) )
for feature in train_dataset.features
if feature != "special_tokens_mask"
}
train_signature["labels"] = train_signature["input_ids"]
train_signature = (train_signature, train_signature["labels"])
options = tf.data.Options() options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_train_dataset = (
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) tf_train_dataset = train_dataset.to_tf_dataset(
.with_options(options) # labels are passed as input, as we will use the model's internal loss
.batch(batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True) columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"],
.repeat(int(training_args.num_train_epochs)) shuffle=True,
) batch_size=num_replicas * training_args.per_device_train_batch_size,
eval_generator = partial(sample_generator, eval_dataset, tokenizer) collate_fn=data_collator,
eval_signature = { drop_remainder=True,
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64) ).with_options(options)
for feature in eval_dataset.features
if feature != "special_tokens_mask" tf_eval_dataset = eval_dataset.to_tf_dataset(
} # labels are passed as input, as we will use the model's internal loss
eval_signature["labels"] = eval_signature["input_ids"] columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
eval_signature = (eval_signature, eval_signature["labels"]) shuffle=False,
tf_eval_dataset = ( batch_size=num_replicas * training_args.per_device_train_batch_size,
tf.data.Dataset.from_generator(eval_generator, output_signature=eval_signature) collate_fn=data_collator,
.with_options(options) drop_remainder=True,
.batch(batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True) ).with_options(options)
)
# endregion # endregion
# region Optimizer and loss # region Optimizer and loss
@@ -575,10 +518,8 @@ def main():
weight_decay_rate=training_args.weight_decay, weight_decay_rate=training_args.weight_decay,
) )
def dummy_loss(y_true, y_pred): # no user-specified loss = will use the model internal loss
return tf.reduce_mean(y_pred) model.compile(optimizer=optimizer)
model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
# endregion # endregion
# region Training and validation # region Training and validation