Update TF LM examples (#15855)
This commit is contained in:
@@ -29,13 +29,11 @@ import os
|
|||||||
import random
|
import random
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import partial
|
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
@@ -48,6 +46,7 @@ from transformers import (
|
|||||||
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
|
DefaultDataCollator,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
TFAutoModelForCausalLM,
|
TFAutoModelForCausalLM,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
@@ -160,9 +159,6 @@ class DataTrainingArguments:
|
|||||||
default=None,
|
default=None,
|
||||||
metadata={"help": "The number of processes to use for the preprocessing."},
|
metadata={"help": "The number of processes to use for the preprocessing."},
|
||||||
)
|
)
|
||||||
mlm_probability: float = field(
|
|
||||||
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
|
|
||||||
)
|
|
||||||
line_by_line: bool = field(
|
line_by_line: bool = field(
|
||||||
default=False,
|
default=False,
|
||||||
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
|
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
|
||||||
@@ -212,20 +208,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
|||||||
self.model.save_pretrained(self.output_dir)
|
self.model.save_pretrained(self.output_dir)
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Data generator
|
|
||||||
def sample_generator(dataset, tokenizer):
|
|
||||||
# Trim off the last partial batch if present
|
|
||||||
sample_ordering = np.random.permutation(len(dataset))
|
|
||||||
for sample_idx in sample_ordering:
|
|
||||||
example = dataset[int(sample_idx)]
|
|
||||||
# Handle dicts with proper padding and conversion to tensor.
|
|
||||||
example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int64) for key, arr in example.items()}
|
|
||||||
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
@@ -457,34 +439,27 @@ def main():
|
|||||||
|
|
||||||
# region TF Dataset preparation
|
# region TF Dataset preparation
|
||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
train_generator = partial(sample_generator, train_dataset, tokenizer)
|
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||||
train_signature = {
|
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
|
|
||||||
for feature in train_dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
|
||||||
}
|
|
||||||
train_sig = (train_signature, train_signature["labels"])
|
|
||||||
options = tf.data.Options()
|
options = tf.data.Options()
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
tf_train_dataset = (
|
|
||||||
tf.data.Dataset.from_generator(train_generator, output_signature=train_sig)
|
tf_train_dataset = train_dataset.to_tf_dataset(
|
||||||
.with_options(options)
|
# labels are passed as input, as we will use the model's internal loss
|
||||||
.batch(batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True)
|
columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
|
||||||
.repeat(int(training_args.num_train_epochs))
|
shuffle=True,
|
||||||
)
|
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||||
eval_generator = partial(sample_generator, eval_dataset, tokenizer)
|
collate_fn=data_collator,
|
||||||
eval_signature = {
|
drop_remainder=True,
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
|
).with_options(options)
|
||||||
for feature in eval_dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
||||||
}
|
# labels are passed as input, as we will use the model's internal loss
|
||||||
eval_sig = (eval_signature, eval_signature["labels"])
|
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
|
||||||
tf_eval_dataset = (
|
shuffle=False,
|
||||||
tf.data.Dataset.from_generator(eval_generator, output_signature=eval_sig)
|
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||||
.with_options(options)
|
collate_fn=data_collator,
|
||||||
.batch(batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True)
|
drop_remainder=True,
|
||||||
.repeat(int(training_args.num_train_epochs))
|
).with_options(options)
|
||||||
)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer and loss
|
# region Optimizer and loss
|
||||||
@@ -500,10 +475,8 @@ def main():
|
|||||||
weight_decay_rate=training_args.weight_decay,
|
weight_decay_rate=training_args.weight_decay,
|
||||||
)
|
)
|
||||||
|
|
||||||
def dummy_loss(y_true, y_pred):
|
# no user-specified loss = will use the model internal loss
|
||||||
return tf.reduce_mean(y_pred)
|
model.compile(optimizer=optimizer)
|
||||||
|
|
||||||
model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training and validation
|
# region Training and validation
|
||||||
|
|||||||
@@ -31,13 +31,11 @@ import os
|
|||||||
import random
|
import random
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import partial
|
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
@@ -50,6 +48,7 @@ from transformers import (
|
|||||||
TF_MODEL_FOR_MASKED_LM_MAPPING,
|
TF_MODEL_FOR_MASKED_LM_MAPPING,
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
|
DataCollatorForLanguageModeling,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
TFAutoModelForMaskedLM,
|
TFAutoModelForMaskedLM,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
@@ -217,56 +216,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
|||||||
self.model.save_pretrained(self.output_dir)
|
self.model.save_pretrained(self.output_dir)
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Data generator
|
|
||||||
def sample_generator(dataset, tokenizer, mlm_probability=0.15, pad_to_multiple_of=None):
|
|
||||||
if tokenizer.mask_token is None:
|
|
||||||
raise ValueError("This tokenizer does not have a mask token which is necessary for masked language modeling. ")
|
|
||||||
# Trim off the last partial batch if present
|
|
||||||
sample_ordering = np.random.permutation(len(dataset))
|
|
||||||
for sample_idx in sample_ordering:
|
|
||||||
example = dataset[int(sample_idx)]
|
|
||||||
# Handle dicts with proper padding and conversion to tensor.
|
|
||||||
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
|
|
||||||
special_tokens_mask = example.pop("special_tokens_mask", None)
|
|
||||||
example["input_ids"], example["labels"] = mask_tokens(
|
|
||||||
example["input_ids"], mlm_probability, tokenizer, special_tokens_mask=special_tokens_mask
|
|
||||||
)
|
|
||||||
if tokenizer.pad_token_id is not None:
|
|
||||||
example["labels"][example["labels"] == tokenizer.pad_token_id] = -100
|
|
||||||
example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()}
|
|
||||||
|
|
||||||
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
def mask_tokens(inputs, mlm_probability, tokenizer, special_tokens_mask):
|
|
||||||
"""
|
|
||||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
|
|
||||||
"""
|
|
||||||
labels = np.copy(inputs)
|
|
||||||
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
|
|
||||||
probability_matrix = np.random.random_sample(labels.shape)
|
|
||||||
special_tokens_mask = special_tokens_mask.astype(np.bool_)
|
|
||||||
|
|
||||||
probability_matrix[special_tokens_mask] = 0.0
|
|
||||||
masked_indices = probability_matrix > (1 - mlm_probability)
|
|
||||||
labels[~masked_indices] = -100 # We only compute loss on masked tokens
|
|
||||||
|
|
||||||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
|
|
||||||
indices_replaced = (np.random.random_sample(labels.shape) < 0.8) & masked_indices
|
|
||||||
inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
|
|
||||||
|
|
||||||
# 10% of the time, we replace masked input tokens with random word
|
|
||||||
indices_random = (np.random.random_sample(labels.shape) < 0.5) & masked_indices & ~indices_replaced
|
|
||||||
random_words = np.random.randint(low=0, high=len(tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64)
|
|
||||||
inputs[indices_random] = random_words
|
|
||||||
|
|
||||||
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
|
|
||||||
return inputs, labels
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
@@ -531,35 +480,29 @@ def main():
|
|||||||
|
|
||||||
# region TF Dataset preparation
|
# region TF Dataset preparation
|
||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
train_generator = partial(sample_generator, train_dataset, tokenizer)
|
data_collator = DataCollatorForLanguageModeling(
|
||||||
train_signature = {
|
tokenizer=tokenizer, mlm_probability=data_args.mlm_probability, return_tensors="tf"
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
|
)
|
||||||
for feature in train_dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
|
||||||
}
|
|
||||||
train_signature["labels"] = train_signature["input_ids"]
|
|
||||||
train_signature = (train_signature, train_signature["labels"])
|
|
||||||
options = tf.data.Options()
|
options = tf.data.Options()
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
tf_train_dataset = (
|
|
||||||
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
|
tf_train_dataset = train_dataset.to_tf_dataset(
|
||||||
.with_options(options)
|
# labels are passed as input, as we will use the model's internal loss
|
||||||
.batch(batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True)
|
columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
||||||
.repeat(int(training_args.num_train_epochs))
|
shuffle=True,
|
||||||
)
|
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||||
eval_generator = partial(sample_generator, eval_dataset, tokenizer)
|
collate_fn=data_collator,
|
||||||
eval_signature = {
|
drop_remainder=True,
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
|
).with_options(options)
|
||||||
for feature in eval_dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
||||||
}
|
# labels are passed as input, as we will use the model's internal loss
|
||||||
eval_signature["labels"] = eval_signature["input_ids"]
|
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
||||||
eval_signature = (eval_signature, eval_signature["labels"])
|
shuffle=False,
|
||||||
tf_eval_dataset = (
|
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||||
tf.data.Dataset.from_generator(eval_generator, output_signature=eval_signature)
|
collate_fn=data_collator,
|
||||||
.with_options(options)
|
drop_remainder=True,
|
||||||
.batch(batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True)
|
).with_options(options)
|
||||||
)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer and loss
|
# region Optimizer and loss
|
||||||
@@ -575,10 +518,8 @@ def main():
|
|||||||
weight_decay_rate=training_args.weight_decay,
|
weight_decay_rate=training_args.weight_decay,
|
||||||
)
|
)
|
||||||
|
|
||||||
def dummy_loss(y_true, y_pred):
|
# no user-specified loss = will use the model internal loss
|
||||||
return tf.reduce_mean(y_pred)
|
model.compile(optimizer=optimizer)
|
||||||
|
|
||||||
model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training and validation
|
# region Training and validation
|
||||||
|
|||||||
Reference in New Issue
Block a user