Replace as_target context managers by direct calls (#18325)
* Preliminary work on tokenizers * Quality + fix tests * Treat processors * Fix pad * Remove all uses of the as_target context managers in tests, docs and examples * Replace all as_target_tokenizer * Fix tests * Fix quality * Update examples/flax/image-captioning/run_image_captioning_flax.py Co-authored-by: amyeroberts <amy@huggingface.co> * Style Co-authored-by: amyeroberts <amy@huggingface.co>
This commit is contained in:
@@ -109,11 +109,10 @@ The preprocessing function needs to:
|
||||
>>> def prepare_dataset(batch):
|
||||
... audio = batch["audio"]
|
||||
|
||||
... batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
|
||||
... batch["input_values"] = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
|
||||
... batch["input_length"] = len(batch["input_values"])
|
||||
|
||||
... with processor.as_target_processor():
|
||||
... batch["labels"] = processor(batch["transcription"]).input_ids
|
||||
... batch["labels"] = processor(text=batch["transcription"]).input_ids
|
||||
... return batch
|
||||
```
|
||||
|
||||
@@ -146,17 +145,9 @@ Unlike other data collators, this specific data collator needs to apply a differ
|
||||
... input_features = [{"input_values": feature["input_values"]} for feature in features]
|
||||
... label_features = [{"input_ids": feature["labels"]} for feature in features]
|
||||
|
||||
... batch = self.processor.pad(
|
||||
... input_features,
|
||||
... padding=self.padding,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
... with self.processor.as_target_processor():
|
||||
... labels_batch = self.processor.pad(
|
||||
... label_features,
|
||||
... padding=self.padding,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
|
||||
|
||||
... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")
|
||||
|
||||
... # replace padding with -100 to ignore loss correctly
|
||||
... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
||||
|
||||
@@ -67,7 +67,7 @@ Load the T5 tokenizer to process `text` and `summary`:
|
||||
The preprocessing function needs to:
|
||||
|
||||
1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
|
||||
2. Use a context manager with the `as_target_tokenizer()` function to parallelize tokenization of inputs and labels.
|
||||
2. Use the `text_target` keyword argument when tokenizing labels.
|
||||
3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.
|
||||
|
||||
```py
|
||||
@@ -78,8 +78,7 @@ The preprocessing function needs to:
|
||||
... inputs = [prefix + doc for doc in examples["text"]]
|
||||
... model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
|
||||
|
||||
... with tokenizer.as_target_tokenizer():
|
||||
... labels = tokenizer(examples["summary"], max_length=128, truncation=True)
|
||||
... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
|
||||
|
||||
... model_inputs["labels"] = labels["input_ids"]
|
||||
... return model_inputs
|
||||
|
||||
@@ -78,12 +78,7 @@ The preprocessing function needs to:
|
||||
>>> def preprocess_function(examples):
|
||||
... inputs = [prefix + example[source_lang] for example in examples["translation"]]
|
||||
... targets = [example[target_lang] for example in examples["translation"]]
|
||||
... model_inputs = tokenizer(inputs, max_length=128, truncation=True)
|
||||
|
||||
... with tokenizer.as_target_tokenizer():
|
||||
... labels = tokenizer(targets, max_length=128, truncation=True)
|
||||
|
||||
... model_inputs["labels"] = labels["input_ids"]
|
||||
... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
|
||||
... return model_inputs
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user