Replace as_target context managers by direct calls (#18325)
* Preliminary work on tokenizers
* Quality + fix tests
* Treat processors
* Fix pad
* Remove all uses of `as_target_processor` in tests, docs and examples
* Replace all `as_target_tokenizer`
* Fix tests
* Fix quality
* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts <amy@huggingface.co>

* Style

Co-authored-by: amyeroberts <amy@huggingface.co>
This commit is contained in:
@@ -552,11 +552,14 @@ def main():
|
||||
targets = captions
|
||||
|
||||
model_inputs = {}
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(
|
||||
targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
|
||||
)
|
||||
|
||||
labels = tokenizer(
|
||||
text_target=targets,
|
||||
max_length=max_target_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="np",
|
||||
)
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
decoder_input_ids = shift_tokens_right_fn(
|
||||
labels["input_ids"], model.config.pad_token_id, model.config.decoder_start_token_id
|
||||
|
||||
@@ -590,10 +590,13 @@ def main():
|
||||
)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(
|
||||
targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
|
||||
)
|
||||
labels = tokenizer(
|
||||
text_target=targets,
|
||||
max_length=max_target_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="np",
|
||||
)
|
||||
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
decoder_input_ids = shift_tokens_right_fn(
|
||||
|
||||
@@ -453,9 +453,8 @@ def main():
|
||||
inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)
|
||||
|
||||
model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
@@ -479,9 +478,8 @@ def main():
|
||||
return_overflowing_tokens=True,
|
||||
return_offsets_mapping=True,
|
||||
)
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
|
||||
|
||||
# Since one example might give us several features if it has a long context, we need a map from a feature to
|
||||
# its corresponding example. This key gives us just that.
|
||||
|
||||
@@ -305,13 +305,12 @@ class DataCollatorCTCWithPadding:
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
with self.processor.as_target_processor():
|
||||
labels_batch = self.processor.pad(
|
||||
label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
labels_batch = self.processor.pad(
|
||||
labels=label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# replace padding with -100 to ignore loss correctly
|
||||
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
|
||||
@@ -522,9 +522,8 @@ def main():
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -470,9 +470,8 @@ def main():
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -443,9 +443,8 @@ def main():
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -452,9 +452,8 @@ def main():
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -304,13 +304,12 @@ class DataCollatorCTCWithPadding:
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
with self.processor.as_target_processor():
|
||||
labels_batch = self.processor.pad(
|
||||
label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
labels_batch = self.processor.pad(
|
||||
labels=label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# replace padding with -100 to ignore loss correctly
|
||||
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
|
||||
@@ -301,13 +301,12 @@ class DataCollatorCTCWithPadding:
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
with self.processor.as_target_processor():
|
||||
labels_batch = self.processor.pad(
|
||||
label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
labels_batch = self.processor.pad(
|
||||
labels=label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# replace padding with -100 to ignore loss correctly
|
||||
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
|
||||
@@ -437,13 +437,12 @@ def main():
|
||||
table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
|
||||
)
|
||||
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(
|
||||
answer=[", ".join(answer) for answer in answers],
|
||||
max_length=max_target_length,
|
||||
padding=padding,
|
||||
truncation=True,
|
||||
)
|
||||
labels = tokenizer(
|
||||
answer=[", ".join(answer) for answer in answers],
|
||||
max_length=max_target_length,
|
||||
padding=padding,
|
||||
truncation=True,
|
||||
)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -413,13 +413,12 @@ def main():
|
||||
table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
|
||||
)
|
||||
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(
|
||||
answer=[", ".join(answer) for answer in answers],
|
||||
max_length=max_target_length,
|
||||
padding=padding,
|
||||
truncation=True,
|
||||
)
|
||||
labels = tokenizer(
|
||||
answer=[", ".join(answer) for answer in answers],
|
||||
max_length=max_target_length,
|
||||
padding=padding,
|
||||
truncation=True,
|
||||
)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -266,14 +266,13 @@ class DataCollatorCTCWithPadding:
|
||||
pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
return_tensors="pt",
|
||||
)
|
||||
with self.processor.as_target_processor():
|
||||
labels_batch = self.processor.pad(
|
||||
label_features,
|
||||
padding=self.padding,
|
||||
max_length=self.max_length_labels,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
labels_batch = self.processor.pad(
|
||||
labels=label_features,
|
||||
padding=self.padding,
|
||||
max_length=self.max_length_labels,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# replace padding with -100 to ignore loss correctly
|
||||
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
@@ -419,9 +418,10 @@ def main():
|
||||
len(set(batch["sampling_rate"])) == 1
|
||||
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
|
||||
|
||||
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
|
||||
with processor.as_target_processor():
|
||||
batch["labels"] = processor(batch[data_args.target_text_column]).input_ids
|
||||
processed_batch = processor(
|
||||
audio=batch["speech"], text=batch[data_args.target_text_column], sampling_rate=batch["sampling_rate"][0]
|
||||
)
|
||||
batch.update(processed_batch)
|
||||
return batch
|
||||
|
||||
train_dataset = train_dataset.map(
|
||||
|
||||
@@ -185,14 +185,13 @@ class DataCollatorCTCWithPadding:
|
||||
pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
return_tensors="pt",
|
||||
)
|
||||
with self.processor.as_target_processor():
|
||||
labels_batch = self.processor.pad(
|
||||
label_features,
|
||||
padding=self.padding,
|
||||
max_length=self.max_length_labels,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
labels_batch = self.processor.pad(
|
||||
labels=label_features,
|
||||
padding=self.padding,
|
||||
max_length=self.max_length_labels,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# replace padding with -100 to ignore loss correctly
|
||||
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
@@ -414,10 +413,11 @@ def main():
|
||||
assert (
|
||||
len(set(batch["sampling_rate"])) == 1
|
||||
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
|
||||
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
|
||||
# Setup the processor for targets
|
||||
with processor.as_target_processor():
|
||||
batch["labels"] = processor(batch["target_text"]).input_ids
|
||||
|
||||
processed_batch = processor(
|
||||
audio=batch["speech"], text=batch["target_text"], sampling_rate=batch["sampling_rate"][0]
|
||||
)
|
||||
batch.update(processed_batch)
|
||||
return batch
|
||||
|
||||
train_dataset = train_dataset.map(
|
||||
|
||||
@@ -349,13 +349,12 @@ class SpeechDataCollatorWithPadding:
|
||||
|
||||
if self.pad_labels:
|
||||
label_features = [{"input_ids": feature["labels"]} for feature in features]
|
||||
with self.processor.as_target_processor():
|
||||
labels_batch = self.processor.pad(
|
||||
label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
labels_batch = self.processor.pad(
|
||||
labels=label_features,
|
||||
padding=self.padding,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# replace padding with -100 to ignore loss correctly
|
||||
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
|
||||
@@ -504,9 +504,8 @@ def main():
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
@@ -458,9 +458,8 @@ def main():
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
|
||||
|
||||
# Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
# Tokenize targets with the `text_target` keyword argument
|
||||
labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
|
||||
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
|
||||
Reference in New Issue
Block a user