Replace as_target context managers by direct calls (#18325)

* Preliminary work on tokenizers

* Quality + fix tests

* Treat processors

* Fix pad

* Remove all uses of  in tests, docs and examples

* Replace all as_target_tokenizer

* Fix tests

* Fix quality

* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts <amy@huggingface.co>

* Style

Co-authored-by: amyeroberts <amy@huggingface.co>
This commit is contained in:
Sylvain Gugger
2022-07-29 08:09:09 -04:00
committed by GitHub
parent a64bcb564d
commit 986526a0e4
80 changed files with 725 additions and 550 deletions

View File

@@ -304,13 +304,12 @@ class DataCollatorCTCWithPadding:
return_tensors="pt",
)
with self.processor.as_target_processor():
labels_batch = self.processor.pad(
label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
labels_batch = self.processor.pad(
labels=label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -301,13 +301,12 @@ class DataCollatorCTCWithPadding:
return_tensors="pt",
)
with self.processor.as_target_processor():
labels_batch = self.processor.pad(
label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
labels_batch = self.processor.pad(
labels=label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -437,13 +437,12 @@ def main():
table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
)
with tokenizer.as_target_tokenizer():
labels = tokenizer(
answer=[", ".join(answer) for answer in answers],
max_length=max_target_length,
padding=padding,
truncation=True,
)
labels = tokenizer(
answer=[", ".join(answer) for answer in answers],
max_length=max_target_length,
padding=padding,
truncation=True,
)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss.

View File

@@ -413,13 +413,12 @@ def main():
table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
)
with tokenizer.as_target_tokenizer():
labels = tokenizer(
answer=[", ".join(answer) for answer in answers],
max_length=max_target_length,
padding=padding,
truncation=True,
)
labels = tokenizer(
answer=[", ".join(answer) for answer in answers],
max_length=max_target_length,
padding=padding,
truncation=True,
)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss.

View File

@@ -266,14 +266,13 @@ class DataCollatorCTCWithPadding:
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt",
)
with self.processor.as_target_processor():
labels_batch = self.processor.pad(
label_features,
padding=self.padding,
max_length=self.max_length_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
labels_batch = self.processor.pad(
labels=label_features,
padding=self.padding,
max_length=self.max_length_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
@@ -419,9 +418,10 @@ def main():
len(set(batch["sampling_rate"])) == 1
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
with processor.as_target_processor():
batch["labels"] = processor(batch[data_args.target_text_column]).input_ids
processed_batch = processor(
audio=batch["speech"], text=batch[data_args.target_text_column], sampling_rate=batch["sampling_rate"][0]
)
batch.update(processed_batch)
return batch
train_dataset = train_dataset.map(

View File

@@ -185,14 +185,13 @@ class DataCollatorCTCWithPadding:
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt",
)
with self.processor.as_target_processor():
labels_batch = self.processor.pad(
label_features,
padding=self.padding,
max_length=self.max_length_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
labels_batch = self.processor.pad(
labels=label_features,
padding=self.padding,
max_length=self.max_length_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
@@ -414,10 +413,11 @@ def main():
assert (
len(set(batch["sampling_rate"])) == 1
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
# Setup the processor for targets
with processor.as_target_processor():
batch["labels"] = processor(batch["target_text"]).input_ids
processed_batch = processor(
audio=batch["speech"], text=batch["target_text"], sampling_rate=batch["sampling_rate"][0]
)
batch.update(processed_batch)
return batch
train_dataset = train_dataset.map(

View File

@@ -349,13 +349,12 @@ class SpeechDataCollatorWithPadding:
if self.pad_labels:
label_features = [{"input_ids": feature["labels"]} for feature in features]
with self.processor.as_target_processor():
labels_batch = self.processor.pad(
label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
labels_batch = self.processor.pad(
labels=label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)