Replace as_target context managers by direct calls (#18325)

* Preliminary work on tokenizers

* Quality + fix tests

* Treat processors

* Fix pad

* Remove all uses of the as_target context managers in tests, docs and examples

* Replace all as_target_tokenizer

* Fix tests

* Fix quality

* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts <amy@huggingface.co>

* Style

Co-authored-by: amyeroberts <amy@huggingface.co>
This commit is contained in:
Sylvain Gugger
2022-07-29 08:09:09 -04:00
committed by GitHub
parent a64bcb564d
commit 986526a0e4
80 changed files with 725 additions and 550 deletions

View File

@@ -55,9 +55,7 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en
src_text = "Life is like a box of chocolates." src_text = "Life is like a box of chocolates."
tgt_text = "La vie est comme une boîte de chocolat." tgt_text = "La vie est comme une boîte de chocolat."
model_inputs = tokenizer(src_text, return_tensors="pt") model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
labels = tokenizer(tgt_text, return_tensors="pt").input_ids
loss = model(**model_inputs, labels=labels) # forward pass loss = model(**model_inputs, labels=labels) # forward pass
``` ```

View File

@@ -155,7 +155,7 @@ Example of translating english to many romance languages, using old-style 2 char
## MarianTokenizer ## MarianTokenizer
[[autodoc]] MarianTokenizer [[autodoc]] MarianTokenizer
- as_target_tokenizer - build_inputs_with_special_tokens
## MarianModel ## MarianModel

View File

@@ -34,8 +34,8 @@ model is multilingual it expects the sequences in a different format. A special
source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The
target text format is `[tgt_lang_code] X [eos]`. `bos` is never used. target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
The regular [`~MBartTokenizer.__call__`] will encode source text format, and it should be wrapped The regular [`~MBartTokenizer.__call__`] will encode source text format passed as first argument or with the `text`
inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode target text format. keyword, and target text format passed with the `text_target` keyword argument.
- Supervised training - Supervised training
@@ -46,13 +46,11 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar
>>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria" >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt") >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> # forward pass >>> # forward pass
>>> model(**inputs, labels=batch["labels"]) >>> model(**inputs)
``` ```
- Generation - Generation
@@ -108,11 +106,9 @@ tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_
src_text = " UN Chief Says There Is No Military Solution in Syria" src_text = " UN Chief Says There Is No Military Solution in Syria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, return_tensors="pt") model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
labels = tokenizer(tgt_text, return_tensors="pt").input_ids
model(**model_inputs, labels=labels) # forward pass model(**model_inputs) # forward pass
``` ```
- Generation - Generation
@@ -154,7 +150,6 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
## MBartTokenizer ## MBartTokenizer
[[autodoc]] MBartTokenizer [[autodoc]] MBartTokenizer
- as_target_tokenizer
- build_inputs_with_special_tokens - build_inputs_with_special_tokens
## MBartTokenizerFast ## MBartTokenizerFast

View File

@@ -48,7 +48,6 @@ This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The origi
- save_pretrained - save_pretrained
- batch_decode - batch_decode
- decode - decode
- as_target_processor
## MCTCTModel ## MCTCTModel

View File

@@ -91,7 +91,6 @@ UN-Chef sagt, es gibt keine militärische Lösung in Syrien
## NllbTokenizer ## NllbTokenizer
[[autodoc]] NllbTokenizer [[autodoc]] NllbTokenizer
- as_target_tokenizer
- build_inputs_with_special_tokens - build_inputs_with_special_tokens
## NllbTokenizerFast ## NllbTokenizerFast

View File

@@ -45,8 +45,9 @@ target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this. However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this.
In cases where the language code is needed, The regular [`~PLBartTokenizer.__call__`] will encode source text format, and it should be wrapped In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format
inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format. when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if
it's passed with the `text_target` keyword argument.
- Supervised training - Supervised training
@@ -56,11 +57,7 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode ta
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python") >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")
>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
>>> expected_translation_english = "Returns the maximum value of a b c." >>> expected_translation_english = "Returns the maximum value of a b c."
>>> inputs = tokenizer(example_python_phrase, return_tensors="pt") >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_english, return_tensors="pt")
>>> inputs["labels"] = labels["input_ids"]
>>> # forward pass
>>> model(**inputs) >>> model(**inputs)
``` ```
@@ -88,7 +85,6 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode ta
## PLBartTokenizer ## PLBartTokenizer
[[autodoc]] PLBartTokenizer [[autodoc]] PLBartTokenizer
- as_target_tokenizer
- build_inputs_with_special_tokens - build_inputs_with_special_tokens
## PLBartModel ## PLBartModel

View File

@@ -107,7 +107,7 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target seq
>>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids >>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids
>>> # the forward function automatically creates the correct decoder_input_ids >>> # the forward function automatically creates the correct decoder_input_ids
>>> loss = model(input_values, labels=labels).loss >>> loss = model(input_values=input_values, labels=labels).loss
>>> loss.backward() >>> loss.backward()
``` ```

View File

@@ -120,7 +120,6 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look
- save_pretrained - save_pretrained
- batch_decode - batch_decode
- decode - decode
- as_target_processor
## Speech2TextModel ## Speech2TextModel

View File

@@ -114,7 +114,6 @@ See [model hub](https://huggingface.co/models?filter=speech2text2) to look for S
- save_pretrained - save_pretrained
- batch_decode - batch_decode
- decode - decode
- as_target_processor
## Speech2Text2ForCausalLM ## Speech2Text2ForCausalLM

View File

@@ -94,7 +94,6 @@ See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOC
- save_pretrained - save_pretrained
- batch_decode - batch_decode
- decode - decode
- as_target_processor
## TrOCRForCausalLM ## TrOCRForCausalLM

View File

@@ -62,7 +62,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
- save_pretrained - save_pretrained
- batch_decode - batch_decode
- decode - decode
- as_target_processor
## Wav2Vec2ProcessorWithLM ## Wav2Vec2ProcessorWithLM
@@ -73,7 +72,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
- save_pretrained - save_pretrained
- batch_decode - batch_decode
- decode - decode
- as_target_processor
## Wav2Vec2 specific outputs ## Wav2Vec2 specific outputs

View File

@@ -486,10 +486,8 @@ A processor combines a feature extractor and tokenizer. Load a processor with [`
>>> def prepare_dataset(example): >>> def prepare_dataset(example):
... audio = example["audio"] ... audio = example["audio"]
... example["input_values"] = processor(audio["array"], sampling_rate=16000) ... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
... with processor.as_target_processor():
... example["labels"] = processor(example["text"]).input_ids
... return example ... return example
``` ```

View File

@@ -109,11 +109,10 @@ The preprocessing function needs to:
>>> def prepare_dataset(batch): >>> def prepare_dataset(batch):
... audio = batch["audio"] ... audio = batch["audio"]
... batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0] ... batch["input_values"] = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
... batch["input_length"] = len(batch["input_values"]) ... batch["input_length"] = len(batch["input_values"])
... with processor.as_target_processor(): ... batch["labels"] = processor(text=batch["transcription"]).input_ids
... batch["labels"] = processor(batch["transcription"]).input_ids
... return batch ... return batch
``` ```
@@ -146,17 +145,9 @@ Unlike other data collators, this specific data collator needs to apply a differ
... input_features = [{"input_values": feature["input_values"]} for feature in features] ... input_features = [{"input_values": feature["input_values"]} for feature in features]
... label_features = [{"input_ids": feature["labels"]} for feature in features] ... label_features = [{"input_ids": feature["labels"]} for feature in features]
... batch = self.processor.pad( ... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
... input_features,
... padding=self.padding, ... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")
... return_tensors="pt",
... )
... with self.processor.as_target_processor():
... labels_batch = self.processor.pad(
... label_features,
... padding=self.padding,
... return_tensors="pt",
... )
... # replace padding with -100 to ignore loss correctly ... # replace padding with -100 to ignore loss correctly
... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) ... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -67,7 +67,7 @@ Load the T5 tokenizer to process `text` and `summary`:
The preprocessing function needs to: The preprocessing function needs to:
1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. 1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
2. Use a context manager with the `as_target_tokenizer()` function to parallelize tokenization of inputs and labels. 2. Use the `text_target` keyword argument when tokenizing labels.
3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. 3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.
```py ```py
@@ -78,8 +78,7 @@ The preprocessing function needs to:
... inputs = [prefix + doc for doc in examples["text"]] ... inputs = [prefix + doc for doc in examples["text"]]
... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) ... model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
... with tokenizer.as_target_tokenizer(): ... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
... labels = tokenizer(examples["summary"], max_length=128, truncation=True)
... model_inputs["labels"] = labels["input_ids"] ... model_inputs["labels"] = labels["input_ids"]
... return model_inputs ... return model_inputs

View File

@@ -78,12 +78,7 @@ The preprocessing function needs to:
>>> def preprocess_function(examples): >>> def preprocess_function(examples):
... inputs = [prefix + example[source_lang] for example in examples["translation"]] ... inputs = [prefix + example[source_lang] for example in examples["translation"]]
... targets = [example[target_lang] for example in examples["translation"]] ... targets = [example[target_lang] for example in examples["translation"]]
... model_inputs = tokenizer(inputs, max_length=128, truncation=True) ... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
... with tokenizer.as_target_tokenizer():
... labels = tokenizer(targets, max_length=128, truncation=True)
... model_inputs["labels"] = labels["input_ids"]
... return model_inputs ... return model_inputs
``` ```

View File

@@ -471,10 +471,8 @@ Un processor combina un extractor de características y un tokenizador. Cargue u
>>> def prepare_dataset(example): >>> def prepare_dataset(example):
... audio = example["audio"] ... audio = example["audio"]
... example["input_values"] = processor(audio["array"], sampling_rate=16000) ... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
... with processor.as_target_processor():
... example["labels"] = processor(example["text"]).input_ids
... return example ... return example
``` ```

View File

@@ -471,10 +471,8 @@ Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un
>>> def prepare_dataset(example): >>> def prepare_dataset(example):
... audio = example["audio"] ... audio = example["audio"]
... example["input_values"] = processor(audio["array"], sampling_rate=16000) ... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
... with processor.as_target_processor():
... example["labels"] = processor(example["text"]).input_ids
... return example ... return example
``` ```

View File

@@ -552,11 +552,14 @@ def main():
targets = captions targets = captions
model_inputs = {} model_inputs = {}
# Setup the tokenizer for targets
with tokenizer.as_target_tokenizer(): labels = tokenizer(
labels = tokenizer( text_target=targets,
targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np" max_length=max_target_length,
) padding="max_length",
truncation=True,
return_tensors="np",
)
model_inputs["labels"] = labels["input_ids"] model_inputs["labels"] = labels["input_ids"]
decoder_input_ids = shift_tokens_right_fn( decoder_input_ids = shift_tokens_right_fn(
labels["input_ids"], model.config.pad_token_id, model.config.decoder_start_token_id labels["input_ids"], model.config.pad_token_id, model.config.decoder_start_token_id

View File

@@ -590,10 +590,13 @@ def main():
) )
# Setup the tokenizer for targets # Setup the tokenizer for targets
with tokenizer.as_target_tokenizer(): labels = tokenizer(
labels = tokenizer( text_target=targets,
targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np" max_length=max_target_length,
) padding="max_length",
truncation=True,
return_tensors="np",
)
model_inputs["labels"] = labels["input_ids"] model_inputs["labels"] = labels["input_ids"]
decoder_input_ids = shift_tokens_right_fn( decoder_input_ids = shift_tokens_right_fn(

View File

@@ -453,9 +453,8 @@ def main():
inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)
model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with text_target=...
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.
@@ -479,9 +478,8 @@ def main():
return_overflowing_tokens=True, return_overflowing_tokens=True,
return_offsets_mapping=True, return_offsets_mapping=True,
) )
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
# Since one example might give us several features if it has a long context, we need a map from a feature to # Since one example might give us several features if it has a long context, we need a map from a feature to
# its corresponding example. This key gives us just that. # its corresponding example. This key gives us just that.

View File

@@ -305,13 +305,12 @@ class DataCollatorCTCWithPadding:
return_tensors="pt", return_tensors="pt",
) )
with self.processor.as_target_processor(): labels_batch = self.processor.pad(
labels_batch = self.processor.pad( labels=label_features,
label_features, padding=self.padding,
padding=self.padding, pad_to_multiple_of=self.pad_to_multiple_of_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels, return_tensors="pt",
return_tensors="pt", )
)
# replace padding with -100 to ignore loss correctly # replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -522,9 +522,8 @@ def main():
inputs = [prefix + inp for inp in inputs] inputs = [prefix + inp for inp in inputs]
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -470,9 +470,8 @@ def main():
inputs = [prefix + inp for inp in inputs] inputs = [prefix + inp for inp in inputs]
model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -443,9 +443,8 @@ def main():
inputs = [prefix + inp for inp in inputs] inputs = [prefix + inp for inp in inputs]
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -452,9 +452,8 @@ def main():
inputs = [prefix + inp for inp in inputs] inputs = [prefix + inp for inp in inputs]
model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -304,13 +304,12 @@ class DataCollatorCTCWithPadding:
return_tensors="pt", return_tensors="pt",
) )
with self.processor.as_target_processor(): labels_batch = self.processor.pad(
labels_batch = self.processor.pad( labels=label_features,
label_features, padding=self.padding,
padding=self.padding, pad_to_multiple_of=self.pad_to_multiple_of_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels, return_tensors="pt",
return_tensors="pt", )
)
# replace padding with -100 to ignore loss correctly # replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -301,13 +301,12 @@ class DataCollatorCTCWithPadding:
return_tensors="pt", return_tensors="pt",
) )
with self.processor.as_target_processor(): labels_batch = self.processor.pad(
labels_batch = self.processor.pad( labels=label_features,
label_features, padding=self.padding,
padding=self.padding, pad_to_multiple_of=self.pad_to_multiple_of_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels, return_tensors="pt",
return_tensors="pt", )
)
# replace padding with -100 to ignore loss correctly # replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -437,13 +437,12 @@ def main():
table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
) )
with tokenizer.as_target_tokenizer(): labels = tokenizer(
labels = tokenizer( answer=[", ".join(answer) for answer in answers],
answer=[", ".join(answer) for answer in answers], max_length=max_target_length,
max_length=max_target_length, padding=padding,
padding=padding, truncation=True,
truncation=True, )
)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -413,13 +413,12 @@ def main():
table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
) )
with tokenizer.as_target_tokenizer(): labels = tokenizer(
labels = tokenizer( answer=[", ".join(answer) for answer in answers],
answer=[", ".join(answer) for answer in answers], max_length=max_target_length,
max_length=max_target_length, padding=padding,
padding=padding, truncation=True,
truncation=True, )
)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -266,14 +266,13 @@ class DataCollatorCTCWithPadding:
pad_to_multiple_of=self.pad_to_multiple_of, pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt", return_tensors="pt",
) )
with self.processor.as_target_processor(): labels_batch = self.processor.pad(
labels_batch = self.processor.pad( labels=label_features,
label_features, padding=self.padding,
padding=self.padding, max_length=self.max_length_labels,
max_length=self.max_length_labels, pad_to_multiple_of=self.pad_to_multiple_of_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels, return_tensors="pt",
return_tensors="pt", )
)
# replace padding with -100 to ignore loss correctly # replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
@@ -419,9 +418,10 @@ def main():
len(set(batch["sampling_rate"])) == 1 len(set(batch["sampling_rate"])) == 1
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values processed_batch = processor(
with processor.as_target_processor(): audio=batch["speech"], text=batch[data_args.target_text_column], sampling_rate=batch["sampling_rate"][0]
batch["labels"] = processor(batch[data_args.target_text_column]).input_ids )
batch.update(processed_batch)
return batch return batch
train_dataset = train_dataset.map( train_dataset = train_dataset.map(

View File

@@ -185,14 +185,13 @@ class DataCollatorCTCWithPadding:
pad_to_multiple_of=self.pad_to_multiple_of, pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt", return_tensors="pt",
) )
with self.processor.as_target_processor(): labels_batch = self.processor.pad(
labels_batch = self.processor.pad( labels=label_features,
label_features, padding=self.padding,
padding=self.padding, max_length=self.max_length_labels,
max_length=self.max_length_labels, pad_to_multiple_of=self.pad_to_multiple_of_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels, return_tensors="pt",
return_tensors="pt", )
)
# replace padding with -100 to ignore loss correctly # replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
@@ -414,10 +413,11 @@ def main():
assert ( assert (
len(set(batch["sampling_rate"])) == 1 len(set(batch["sampling_rate"])) == 1
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
# Setup the processor for targets processed_batch = processor(
with processor.as_target_processor(): audio=batch["speech"], text=batch["target_text"], sampling_rate=batch["sampling_rate"][0]
batch["labels"] = processor(batch["target_text"]).input_ids )
batch.update(processed_batch)
return batch return batch
train_dataset = train_dataset.map( train_dataset = train_dataset.map(

View File

@@ -349,13 +349,12 @@ class SpeechDataCollatorWithPadding:
if self.pad_labels: if self.pad_labels:
label_features = [{"input_ids": feature["labels"]} for feature in features] label_features = [{"input_ids": feature["labels"]} for feature in features]
with self.processor.as_target_processor(): labels_batch = self.processor.pad(
labels_batch = self.processor.pad( labels=label_features,
label_features, padding=self.padding,
padding=self.padding, pad_to_multiple_of=self.pad_to_multiple_of_labels,
pad_to_multiple_of=self.pad_to_multiple_of_labels, return_tensors="pt",
return_tensors="pt", )
)
# replace padding with -100 to ignore loss correctly # replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

View File

@@ -504,9 +504,8 @@ def main():
inputs = [prefix + inp for inp in inputs] inputs = [prefix + inp for inp in inputs]
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -458,9 +458,8 @@ def main():
inputs = [prefix + inp for inp in inputs] inputs = [prefix + inp for inp in inputs]
model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
# Setup the tokenizer for targets # Tokenize targets with the `text_target` keyword argument
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss. # padding in the loss.

View File

@@ -1612,9 +1612,8 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
>>> # compute loss >>> # compute loss
>>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST" >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
>>> # wrap processor as target processor to encode labels >>> # Pass the transcription as text to encode labels
>>> with processor.as_target_processor(): >>> labels = processor(text=transcription, return_tensors="tf").input_values
... labels = processor(transcription, return_tensors="tf").input_values
>>> loss = model(input_values, labels=labels).loss >>> loss = model(input_values, labels=labels).loss
```""" ```"""

View File

@@ -14,7 +14,6 @@
"""Tokenization classes for M2M100.""" """Tokenization classes for M2M100."""
import json import json
import os import os
from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
@@ -116,10 +115,8 @@ class M2M100Tokenizer(PreTrainedTokenizer):
>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro") >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, return_tensors="pt") >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer(): >>> model(**model_inputs) # should work
... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
>>> model(**model_inputs, labels=labels) # should work
```""" ```"""
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -346,16 +343,12 @@ class M2M100Tokenizer(PreTrainedTokenizer):
inputs["forced_bos_token_id"] = tgt_lang_id inputs["forced_bos_token_id"] = tgt_lang_id
return inputs return inputs
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self):
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
sequence-to-sequence models that need a slightly different processing for the labels.
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang) self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None: def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
lang_token = self.get_lang_token(src_lang) lang_token = self.get_lang_token(src_lang)

View File

@@ -15,7 +15,6 @@ import json
import os import os
import re import re
import warnings import warnings
from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
@@ -112,10 +111,7 @@ class MarianTokenizer(PreTrainedTokenizer):
>>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de") >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."] >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
>>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True) >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
>>> inputs["labels"] = labels["input_ids"]
# keys [input_ids, attention_mask, labels]. # keys [input_ids, attention_mask, labels].
>>> outputs = model(**inputs) # should work >>> outputs = model(**inputs) # should work
@@ -281,18 +277,14 @@ class MarianTokenizer(PreTrainedTokenizer):
# We don't expect to process pairs, but leave the pair logic for API consistency # We don't expect to process pairs, but leave the pair logic for API consistency
return token_ids_0 + token_ids_1 + [self.eos_token_id] return token_ids_0 + token_ids_1 + [self.eos_token_id]
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): self.current_spm = self.spm_source
""" self.current_encoder = self.encoder
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
sequence-to-sequence models that need a slightly different processing for the labels. def _switch_to_target_mode(self):
"""
self.current_spm = self.spm_target self.current_spm = self.spm_target
if self.separate_vocabs: if self.separate_vocabs:
self.current_encoder = self.target_encoder self.current_encoder = self.target_encoder
yield
self.current_spm = self.spm_source
self.current_encoder = self.encoder
@property @property
def vocab_size(self) -> int: def vocab_size(self) -> int:

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@@ -69,10 +68,7 @@ class MBartTokenizer(PreTrainedTokenizer):
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO") >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt") >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
>>> inputs["labels"] = labels["input_ids"]
```""" ```"""
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -340,15 +336,11 @@ class MBartTokenizer(PreTrainedTokenizer):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang) -> None: def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@@ -82,10 +81,7 @@ class MBartTokenizerFast(PreTrainedTokenizerFast):
... ) ... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt") >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
>>> inputs["labels"] = labels["input_ids"]
```""" ```"""
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -240,15 +236,11 @@ class MBartTokenizerFast(PreTrainedTokenizerFast):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang) -> None: def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@@ -102,10 +101,8 @@ class MBart50Tokenizer(PreTrainedTokenizer):
>>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, return_tensors="pt") >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer(): >>> # model(**model_inputs) should work
... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
>>> # model(**model_inputs, labels=labels) should work
```""" ```"""
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -337,15 +334,11 @@ class MBart50Tokenizer(PreTrainedTokenizer):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None: def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@@ -98,10 +97,8 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
>>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, return_tensors="pt") >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer(): >>> # model(**model_inputs) should work
... labels = tokenizer(tgt_text, return_tensors="pt").input_ids
>>> # model(**model_inputs, labels=labels) should work
```""" ```"""
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -211,15 +208,11 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None: def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""

View File

@@ -15,6 +15,7 @@
""" """
Speech processor class for M-CTC-T Speech processor class for M-CTC-T
""" """
import warnings
from contextlib import contextmanager from contextlib import contextmanager
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
@@ -39,6 +40,7 @@ class MCTCTProcessor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer): def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer) super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
""" """
@@ -47,7 +49,35 @@ class MCTCTProcessor(ProcessorMixin):
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
[`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. [`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
""" """
return self.current_processor(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
@@ -63,7 +93,28 @@ class MCTCTProcessor(ProcessorMixin):
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
""" """
return self.current_processor.pad(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
def decode(self, *args, **kwargs): def decode(self, *args, **kwargs):
""" """
@@ -77,6 +128,13 @@ class MCTCTProcessor(ProcessorMixin):
""" """
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT. Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
""" """
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer self.current_processor = self.tokenizer
yield yield
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False

View File

@@ -57,8 +57,7 @@ class FlaxMT5Model(FlaxT5Model):
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="np") >>> inputs = tokenizer(article, return_tensors="np")
>>> with tokenizer.as_target_tokenizer(): >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids) >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
>>> hidden_states = outputs.last_hidden_state >>> hidden_states = outputs.last_hidden_state
@@ -84,8 +83,7 @@ class FlaxMT5EncoderModel(FlaxT5EncoderModel):
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="np") >>> inputs = tokenizer(article, return_tensors="np")
>>> with tokenizer.as_target_tokenizer(): >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
>>> outputs = model(input_ids=inputs["input_ids"]) >>> outputs = model(input_ids=inputs["input_ids"])
>>> hidden_states = outputs.last_hidden_state >>> hidden_states = outputs.last_hidden_state
@@ -111,8 +109,7 @@ class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration):
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="np") >>> inputs = tokenizer(article, return_tensors="np")
>>> with tokenizer.as_target_tokenizer(): >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
>>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids) >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
>>> logits = outputs.logits >>> logits = outputs.logits

View File

@@ -40,8 +40,7 @@ class MT5Model(T5Model):
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="pt") >>> inputs = tokenizer(article, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer(): >>> labels = tokenizer(text_target=summary, return_tensors="pt")
... labels = tokenizer(summary, return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"]) >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state >>> hidden_states = outputs.last_hidden_state
@@ -73,11 +72,9 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
>>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="pt") >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(summary, return_tensors="pt")
>>> outputs = model(**inputs, labels=labels["input_ids"]) >>> outputs = model(**inputs)
>>> loss = outputs.loss >>> loss = outputs.loss
```""" ```"""

View File

@@ -40,8 +40,7 @@ class TFMT5Model(TFT5Model):
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="tf") >>> inputs = tokenizer(article, return_tensors="tf")
>>> with tokenizer.as_target_tokenizer(): >>> labels = tokenizer(text_target=summary, return_tensors="tf")
... labels = tokenizer(summary, return_tensors="tf")
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"]) >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state >>> hidden_states = outputs.last_hidden_state
@@ -64,11 +63,9 @@ class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
>>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien." >>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="tf") >>> inputs = tokenizer(article, text_target=summary, return_tensors="tf")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(summary, return_tensors="tf")
>>> outputs = model(**inputs, labels=labels["input_ids"]) >>> outputs = model(**inputs)
>>> loss = outputs.loss >>> loss = outputs.loss
```""" ```"""

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@@ -67,10 +66,7 @@ class NllbTokenizer(PreTrainedTokenizer):
... ) ... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie." >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt") >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_french, return_tensors="pt")
>>> inputs["labels"] = labels["input_ids"]
``` ```
Args: Args:
@@ -386,15 +382,11 @@ class NllbTokenizer(PreTrainedTokenizer):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang) -> None: def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@@ -80,10 +79,7 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
... ) ... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie." >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt") >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_french, return_tensors="pt")
>>> inputs["labels"] = labels["input_ids"]
``` ```
Args: Args:
@@ -284,15 +280,11 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang) -> None: def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""

View File

@@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import os import os
from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@@ -153,10 +152,7 @@ class PLBartTokenizer(PreTrainedTokenizer):
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
>>> expected_translation_english = "Returns the maximum value of a b c." >>> expected_translation_english = "Returns the maximum value of a b c."
>>> inputs = tokenizer(example_python_phrase, return_tensors="pt") >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_english, return_tensors="pt")
>>> inputs["labels"] = labels["input_ids"]
```""" ```"""
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -441,15 +437,11 @@ class PLBartTokenizer(PreTrainedTokenizer):
self.tgt_lang = tgt_lang self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self): return self.set_src_lang_special_tokens(self.src_lang)
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to def _switch_to_target_mode(self):
sequence-to-sequence models that need a slightly different processing for the labels. return self.set_tgt_lang_special_tokens(self.tgt_lang)
"""
self.set_tgt_lang_special_tokens(self.tgt_lang)
yield
self.set_src_lang_special_tokens(self.src_lang)
def set_src_lang_special_tokens(self, src_lang) -> None: def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""

View File

@@ -818,8 +818,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt") >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> with tokenizer.as_target_tokenizer(): >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"] >>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"] >>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels) >>> outputs = model(input_ids=input_ids, labels=labels)
@@ -1287,8 +1286,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt") >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> with tokenizer.as_target_tokenizer(): >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"] >>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"] >>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels) >>> outputs = model(input_ids=input_ids, labels=labels)

View File

@@ -15,7 +15,6 @@
"""Tokenization classes for RAG.""" """Tokenization classes for RAG."""
import os import os
import warnings import warnings
from contextlib import contextmanager
from typing import List, Optional from typing import List, Optional
from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_base import BatchEncoding
@@ -68,16 +67,12 @@ class RagTokenizer:
def decode(self, *args, **kwargs): def decode(self, *args, **kwargs):
return self.generator.decode(*args, **kwargs) return self.generator.decode(*args, **kwargs)
@contextmanager def _switch_to_input_mode(self):
def as_target_tokenizer(self):
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
sequence-to-sequence models that need a slightly different processing for the labels.
"""
self.current_tokenizer = self.generator
yield
self.current_tokenizer = self.question_encoder self.current_tokenizer = self.question_encoder
def _switch_to_target_mode(self):
self.current_tokenizer = self.generator
def prepare_seq2seq_batch( def prepare_seq2seq_batch(
self, self,
src_texts: List[str], src_texts: List[str],
@@ -110,17 +105,16 @@ class RagTokenizer:
if tgt_texts is None: if tgt_texts is None:
return model_inputs return model_inputs
# Process tgt_texts # Process tgt_texts
with self.as_target_tokenizer(): if max_target_length is None:
if max_target_length is None: max_target_length = self.current_tokenizer.model_max_length
max_target_length = self.current_tokenizer.model_max_length labels = self(
labels = self( text_target=tgt_texts,
tgt_texts, add_special_tokens=True,
add_special_tokens=True, return_tensors=return_tensors,
return_tensors=return_tensors, padding=padding,
padding=padding, max_length=max_target_length,
max_length=max_target_length, truncation=truncation,
truncation=truncation, **kwargs,
**kwargs, )
)
model_inputs["labels"] = labels["input_ids"] model_inputs["labels"] = labels["input_ids"]
return model_inputs return model_inputs

View File

@@ -482,8 +482,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
'Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.' 'Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.'
>>> # Training: Train model on English transcription >>> # Training: Train model on English transcription
>>> with processor.as_target_processor(): >>> labels = processor(text=ds[0]["text"], return_tensors="pt").input_ids
... labels = processor(ds[0]["text"], return_tensors="pt").input_ids
>>> loss = model(input_values, labels=labels).loss >>> loss = model(input_values, labels=labels).loss
>>> loss.backward() >>> loss.backward()

View File

@@ -15,6 +15,7 @@
""" """
Speech processor class for Speech2Text Speech processor class for Speech2Text
""" """
import warnings
from contextlib import contextmanager from contextlib import contextmanager
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
@@ -41,6 +42,7 @@ class Speech2TextProcessor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer): def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer) super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
""" """
@@ -50,7 +52,35 @@ class Speech2TextProcessor(ProcessorMixin):
[`~Speech2TextTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more [`~Speech2TextTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
information. information.
""" """
return self.current_processor(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
@@ -72,6 +102,13 @@ class Speech2TextProcessor(ProcessorMixin):
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Speech2Text. Speech2Text.
""" """
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer self.current_processor = self.tokenizer
yield yield
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False

View File

@@ -15,6 +15,7 @@
""" """
Speech processor class for Speech2Text2 Speech processor class for Speech2Text2
""" """
import warnings
from contextlib import contextmanager from contextlib import contextmanager
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
@@ -40,6 +41,7 @@ class Speech2Text2Processor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer): def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer) super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
""" """
@@ -49,7 +51,35 @@ class Speech2Text2Processor(ProcessorMixin):
Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the doctsring of the above two Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the doctsring of the above two
methods for more information. methods for more information.
""" """
return self.current_processor(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
@@ -71,6 +101,13 @@ class Speech2Text2Processor(ProcessorMixin):
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Speech2Text2. Speech2Text2.
""" """
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer self.current_processor = self.tokenizer
yield yield
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False

View File

@@ -17,7 +17,6 @@
import json import json
import os import os
import random import random
from contextlib import contextmanager
from functools import lru_cache from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Tuple, Union
@@ -63,12 +62,6 @@ class TapexTruncationStrategy(ExplicitEnum):
DROP_ROWS_TO_FIT = "drop_rows_to_fit" DROP_ROWS_TO_FIT = "drop_rows_to_fit"
class TokenizerStrategy(ExplicitEnum):
TOKENIZE_SOURCE = "tokenize_source"
TOKENIZE_TARGET = "tokenize_target"
TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
add_special_tokens (`bool`, *optional*, defaults to `True`): add_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to encode the sequences with the special tokens relative to their model. Whether or not to encode the sequences with the special tokens relative to their model.
@@ -341,9 +334,6 @@ class TapexTokenizer(PreTrainedTokenizer):
self.max_cell_length = max_cell_length self.max_cell_length = max_cell_length
self.table_linearize = IndexedRowTableLinearize() self.table_linearize = IndexedRowTableLinearize()
# property to decide using which call function
self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
def build_inputs_with_special_tokens( def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
@@ -555,9 +545,7 @@ class TapexTokenizer(PreTrainedTokenizer):
Optionally, the corresponding answer to the questions as supervision. Optionally, the corresponding answer to the questions as supervision.
""" """
if self.current_tokenizer == TokenizerStrategy.TOKENIZE_SOURCE: if table is not None:
if table is None:
raise ValueError("Please ensure that the table is not empty if you use TAPEX to encode source.")
return self.source_call_func( return self.source_call_func(
table=table, table=table,
query=query, query=query,
@@ -578,9 +566,7 @@ class TapexTokenizer(PreTrainedTokenizer):
verbose=verbose, verbose=verbose,
**kwargs, **kwargs,
) )
else: elif answer is not None:
if answer is None:
raise ValueError("Please ensure that the answer is not empty if you use TAPEX to encode target.")
return self.target_call_func( return self.target_call_func(
answer=answer, answer=answer,
add_special_tokens=add_special_tokens, add_special_tokens=add_special_tokens,
@@ -599,6 +585,8 @@ class TapexTokenizer(PreTrainedTokenizer):
verbose=verbose, verbose=verbose,
**kwargs, **kwargs,
) )
else:
raise ValueError("You need to provide either a `table` or an `answer`.")
def source_call_func( def source_call_func(
self, self,
@@ -1330,17 +1318,6 @@ class TapexTokenizer(PreTrainedTokenizer):
verbose=verbose, verbose=verbose,
) )
@contextmanager
def as_target_tokenizer(self):
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
sequence-to-sequence models that need a slightly different processing for the labels.
"""
self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET
yield
# restore the call function
self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
def prepare_table_query( def prepare_table_query(
self, self,
table, table,

View File

@@ -15,6 +15,7 @@
""" """
Processor class for TrOCR. Processor class for TrOCR.
""" """
import warnings
from contextlib import contextmanager from contextlib import contextmanager
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
@@ -40,6 +41,7 @@ class TrOCRProcessor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer): def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer) super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
""" """
@@ -48,7 +50,35 @@ class TrOCRProcessor(ProcessorMixin):
[`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's [`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
[`~TrOCRTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. [`~TrOCRTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
""" """
return self.current_processor(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
@@ -69,6 +99,13 @@ class TrOCRProcessor(ProcessorMixin):
""" """
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR. Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
""" """
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer self.current_processor = self.tokenizer
yield yield
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False

View File

@@ -1650,9 +1650,8 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
>>> # compute loss >>> # compute loss
>>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST" >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
>>> # wrap processor as target processor to encode labels >>> # Pass transcription as `text` to encode labels
>>> with processor.as_target_processor(): >>> labels = processor(text=transcription, return_tensors="tf").input_ids
... labels = processor(transcription, return_tensors="tf").input_ids
>>> loss = model(input_values, labels=labels).loss >>> loss = model(input_values, labels=labels).loss
```""" ```"""

View File

@@ -43,6 +43,7 @@ class Wav2Vec2Processor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer): def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer) super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
@@ -70,7 +71,35 @@ class Wav2Vec2Processor(ProcessorMixin):
[`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
""" """
return self.current_processor(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def pad(self, *args, **kwargs): def pad(self, *args, **kwargs):
""" """
@@ -79,7 +108,28 @@ class Wav2Vec2Processor(ProcessorMixin):
[`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
""" """
return self.current_processor.pad(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
@@ -101,6 +151,13 @@ class Wav2Vec2Processor(ProcessorMixin):
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Wav2Vec2. Wav2Vec2.
""" """
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer self.current_processor = self.tokenizer
yield yield
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False

View File

@@ -16,6 +16,7 @@
Speech processor class for Wav2Vec2 Speech processor class for Wav2Vec2
""" """
import os import os
import warnings
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from multiprocessing import get_context from multiprocessing import get_context
@@ -99,6 +100,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
self.decoder = decoder self.decoder = decoder
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def save_pretrained(self, save_directory): def save_pretrained(self, save_directory):
super().save_pretrained(save_directory) super().save_pretrained(save_directory)
@@ -214,7 +216,35 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two
methods for more information. methods for more information.
""" """
return self.current_processor(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def pad(self, *args, **kwargs): def pad(self, *args, **kwargs):
""" """
@@ -224,7 +254,28 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods
for more information. for more information.
""" """
return self.current_processor.pad(*args, **kwargs) # For backward compatibility
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
def batch_decode( def batch_decode(
self, self,
@@ -486,9 +537,16 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
@contextmanager @contextmanager
def as_target_processor(self): def as_target_processor(self):
""" """
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning
Wav2Vec2. Wav2Vec2.
""" """
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer self.current_processor = self.tokenizer
yield yield
self.current_processor = self.feature_extractor self.current_processor = self.feature_extractor
self._in_target_context_manager = False

View File

@@ -1501,7 +1501,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
self.deprecation_warnings = ( self.deprecation_warnings = (
{} {}
) # Use to store when we have already noticed a deprecation warning (avoid overlogging). ) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
self._in_target_context_manager = False
super().__init__(**kwargs) super().__init__(**kwargs)
@property @property
@@ -2431,8 +2431,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__( def __call__(
self, self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
add_special_tokens: bool = True, add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False, padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = False,
@@ -2455,15 +2459,85 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
sequences. sequences.
Args: Args:
text (`str`, `List[str]`, `List[List[str]]`): text (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair (`str`, `List[str]`, `List[List[str]]`): text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
""" """
# To avoid duplicating
all_kwargs = dict(
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
)
all_kwargs.update(kwargs)
if text is None and text_target is None:
raise ValueError("You need to specify either `text` or `text_target`.")
if text is not None:
# The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
# input mode in this case.
if not self._in_target_context_manager:
self._switch_to_input_mode()
encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
if text_target is not None:
self._switch_to_target_mode()
target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
# Leave back tokenizer in input mode
self._switch_to_input_mode()
if text_target is None:
return encodings
elif text is None:
return target_encodings
else:
encodings["labels"] = target_encodings["input_ids"]
return encodings
def _call_one(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
# Input type checking for clearer error # Input type checking for clearer error
def _is_valid_text_input(t): def _is_valid_text_input(t):
if isinstance(t, str): if isinstance(t, str):
@@ -3456,13 +3530,34 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
) )
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
def _switch_to_input_mode(self):
"""
Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
"""
pass
def _switch_to_target_mode(self):
    """
    Hook that puts the tokenizer in target mode. It does nothing here and is only relevant for tokenizers that
    have different modes for inputs and targets.
    """
    return None
@contextmanager
def as_target_tokenizer(self):
    """
    Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
    sequence-to-sequence models that need a slightly different processing for the labels.

    Deprecated: emits a warning on entry. Pass the labels through the `text_target` argument of `__call__`
    instead.

    The tokenizer is switched back to input mode when the context exits, including when the body of the `with`
    block raises.
    """
    warnings.warn(
        "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
        "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
        "your input texts if you use the same keyword arguments, or in a separate call)."
    )
    self._switch_to_target_mode()
    # `__call__` checks this flag and skips its own switch to input mode while the context is active.
    self._in_target_context_manager = True
    try:
        yield
    finally:
        # Without the `finally`, an exception raised inside the `with` block would leave the flag set and the
        # tokenizer stuck in target mode for every later call.
        self._in_target_context_manager = False
        self._switch_to_input_mode()
@classmethod @classmethod
def register_for_auto_class(cls, auto_class="AutoTokenizer"): def register_for_auto_class(cls, auto_class="AutoTokenizer"):
@@ -3563,14 +3658,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
# docstyle-ignore # docstyle-ignore
formatted_warning = """ formatted_warning = """
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular `prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare `__call__` method to prepare your inputs and targets.
your targets.
Here is a short example: Here is a short example:
model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)
If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:
model_inputs = tokenizer(src_texts, ...) model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer(): labels = tokenizer(text_target=tgt_texts, ...)
labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"] model_inputs["labels"] = labels["input_ids"]
See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice. See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.

View File

@@ -428,8 +428,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
``` ```
```python ```python
>>> with processor.as_target_processor(): >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="pt").input_ids
... inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids
>>> # compute loss >>> # compute loss
>>> loss = model(**inputs).loss >>> loss = model(**inputs).loss
@@ -849,8 +848,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
``` ```
```python ```python
>>> with processor.as_target_processor(): >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="tf").input_ids
... inputs["labels"] = processor(dataset[0]["text"], return_tensors="tf").input_ids
>>> # compute loss >>> # compute loss
>>> loss = model(**inputs).loss >>> loss = model(**inputs).loss

View File

@@ -112,14 +112,13 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
self.assertNotIn("decoder_attention_mask", batch) self.assertNotIn("decoder_attention_mask", batch)
@require_torch @require_torch
def test_tokenizer_as_target_length(self):
    """Targets encoded via `text_target` with `padding="max_length"` must have `max_length` columns."""
    summaries = [
        "Summary of the text.",
        "Another summary.",
    ]
    # Run the same check against both default tokenizers held by the test case.
    for tok in (self.default_tokenizer, self.default_tokenizer_fast):
        encoded = tok(text_target=summaries, max_length=32, padding="max_length", return_tensors="pt")
        self.assertEqual(32, encoded["input_ids"].shape[1])
@require_torch @require_torch
@@ -140,8 +139,7 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
] ]
for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
inputs = tokenizer(src_text, return_tensors="pt") inputs = tokenizer(src_text, return_tensors="pt")
with tokenizer.as_target_tokenizer(): targets = tokenizer(text_target=tgt_text, return_tensors="pt")
targets = tokenizer(tgt_text, return_tensors="pt")
input_ids = inputs["input_ids"] input_ids = inputs["input_ids"]
labels = targets["input_ids"] labels = targets["input_ids"]
self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item()) self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())

View File

@@ -152,10 +152,9 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"Summary of the text.", "Summary of the text.",
"Another summary.", "Another summary.",
] ]
with tokenizer.as_target_tokenizer(): targets = tokenizer(
targets = tokenizer( text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK )
)
self.assertEqual(32, targets["input_ids"].shape[1]) self.assertEqual(32, targets["input_ids"].shape[1])
def test_eos_in_input(self): def test_eos_in_input(self):
@@ -167,12 +166,10 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1] expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1]
# fmt: on # fmt: on
batch = tokenizer(src_text) batch = tokenizer(src_text, text_target=tgt_text)
with tokenizer.as_target_tokenizer():
targets = tokenizer(tgt_text)
self.assertEqual(expected_src_tokens, batch["input_ids"][0]) self.assertEqual(expected_src_tokens, batch["input_ids"][0])
self.assertEqual(expected_tgt_tokens, targets["input_ids"][0]) self.assertEqual(expected_tgt_tokens, batch["labels"][0])
# cannot use default save_and_load_tokenzier test method because tokenzier has no vocab # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab
def test_save_and_load_tokenizer(self): def test_save_and_load_tokenizer(self):

View File

@@ -80,8 +80,9 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"What's the weater?", "What's the weater?",
"It's about 25 degrees.", "It's about 25 degrees.",
] ]
with tokenizer.as_target_tokenizer(): targets = tokenizer(
targets = tokenizer(tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt") text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt"
)
self.assertEqual(32, targets["input_ids"].shape[1]) self.assertEqual(32, targets["input_ids"].shape[1])
# cannot use default save_and_load_tokenzier test method because tokenzier has no vocab # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from transformers import ( from transformers import (
DPRContextEncoderTokenizer, DPRContextEncoderTokenizer,
DPRContextEncoderTokenizerFast, DPRContextEncoderTokenizerFast,

View File

@@ -187,9 +187,7 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
self.tokenizer.src_lang = "en" self.tokenizer.src_lang = "en"
self.tokenizer.tgt_lang = "fr" self.tokenizer.tgt_lang = "fr"
batch = self.tokenizer(self.src_text, padding=True, return_tensors="pt") batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
with self.tokenizer.as_target_tokenizer():
batch["labels"] = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt").input_ids
batch["decoder_input_ids"] = shift_tokens_right( batch["decoder_input_ids"] = shift_tokens_right(
batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.eos_token_id batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.eos_token_id
@@ -217,17 +215,19 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id])
@require_torch @require_torch
def test_tokenizer_target_mode(self):
    """
    For each target language, check the prefix/suffix tokens set in target mode and that switching back to
    input mode restores the source-language prefix.
    """
    tok = self.tokenizer
    for lang in ("mr", "zh"):
        tok.tgt_lang = lang

        tok._switch_to_target_mode()
        self.assertListEqual(tok.prefix_tokens, [tok.get_lang_id(lang)])
        self.assertListEqual(tok.suffix_tokens, [tok.eos_token_id])

        tok._switch_to_input_mode()
        self.assertListEqual(tok.prefix_tokens, [tok.get_lang_id(tok.src_lang)])
@require_torch @require_torch

View File

@@ -438,10 +438,7 @@ class TestMarian_EN_DE_More(MarianIntegrationTest):
src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."] src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
expected_ids = [38, 121, 14, 697, 38848, 0] expected_ids = [38, 121, 14, 697, 38848, 0]
model_inputs = self.tokenizer(src, return_tensors="pt").to(torch_device) model_inputs = self.tokenizer(src, text_target=tgt, return_tensors="pt").to(torch_device)
with self.tokenizer.as_target_tokenizer():
targets = self.tokenizer(tgt, return_tensors="pt")
model_inputs["labels"] = targets["input_ids"].to(torch_device)
self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist()) self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())

View File

@@ -145,9 +145,8 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
src_ids = tokenizer(source_text).input_ids src_ids = tokenizer(source_text).input_ids
self.assertListEqual(src_ids, expected_src_ids) self.assertListEqual(src_ids, expected_src_ids)
with tokenizer.as_target_tokenizer(): target_ids = tokenizer(text_target=target_text).input_ids
target_ids = tokenizer(target_text).input_ids self.assertListEqual(target_ids, expected_target_ids)
self.assertListEqual(target_ids, expected_target_ids)
decoded = tokenizer.decode(target_ids, skip_special_tokens=True) decoded = tokenizer.decode(target_ids, skip_special_tokens=True)
self.assertEqual(decoded, target_text) self.assertEqual(decoded, target_text)

View File

@@ -265,33 +265,27 @@ class MBartEnroIntegrationTest(unittest.TestCase):
@require_torch @require_torch
def test_batch_fairseq_parity(self):
    """Encode sources and targets in one call and compare the second example's token layout with fairseq."""
    tok = self.tokenizer
    batch = tok(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
    batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tok.pad_token_id)

    # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
    assert batch.input_ids[1][-2:].tolist() == [2, EN_CODE]
    second_decoder_input = batch.decoder_input_ids[1]
    assert second_decoder_input[0].tolist() == RO_CODE
    assert second_decoder_input[-1] == 2
    assert batch.labels[1][-2:].tolist() == [2, RO_CODE]
@require_torch @require_torch
def test_enro_tokenizer_prepare_batch(self): def test_enro_tokenizer_prepare_batch(self):
batch = self.tokenizer( batch = self.tokenizer(
self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" self.src_text,
text_target=self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
) )
with self.tokenizer.as_target_tokenizer():
targets = self.tokenizer( batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
)
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
self.assertIsInstance(batch, BatchEncoding) self.assertIsInstance(batch, BatchEncoding)
@@ -306,8 +300,9 @@ class MBartEnroIntegrationTest(unittest.TestCase):
def test_seq2seq_max_length(self): def test_seq2seq_max_length(self):
batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
with self.tokenizer.as_target_tokenizer(): targets = self.tokenizer(
targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
)
labels = targets["input_ids"] labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)

View File

@@ -256,35 +256,27 @@ class MBart50OneToManyIntegrationTest(unittest.TestCase):
@require_torch @require_torch
def test_batch_fairseq_parity(self):
    """Encode sources and targets in one call and compare the second example's token layout with fairseq."""
    tok = self.tokenizer
    batch = tok(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
    batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tok.pad_token_id)

    # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
    second_input = batch.input_ids[1]
    assert second_input[0] == EN_CODE
    assert second_input[-1] == 2
    second_label = batch.labels[1]
    assert second_label[0] == RO_CODE
    assert second_label[-1] == 2
    assert batch.decoder_input_ids[1][:2].tolist() == [2, RO_CODE]
@require_torch @require_torch
def test_tokenizer_prepare_batch(self): def test_tokenizer_prepare_batch(self):
batch = self.tokenizer( batch = self.tokenizer(
self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" self.src_text,
text_target=self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
) )
with self.tokenizer.as_target_tokenizer(): batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
targets = self.tokenizer(
self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
)
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
self.assertIsInstance(batch, BatchEncoding) self.assertIsInstance(batch, BatchEncoding)
@@ -299,8 +291,9 @@ class MBart50OneToManyIntegrationTest(unittest.TestCase):
def test_seq2seq_max_target_length(self): def test_seq2seq_max_target_length(self):
batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
with self.tokenizer.as_target_tokenizer(): targets = self.tokenizer(
targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
)
labels = targets["input_ids"] labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)

View File

@@ -125,8 +125,7 @@ class MCTCTProcessorTest(unittest.TestCase):
input_str = "This is a test string" input_str = "This is a test string"
with processor.as_target_processor(): encoded_processor = processor(text=input_str)
encoded_processor = processor(input_str)
encoded_tok = tokenizer(input_str) encoded_tok = tokenizer(input_str)

View File

@@ -112,14 +112,13 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
self.assertNotIn("decoder_attention_mask", batch) self.assertNotIn("decoder_attention_mask", batch)
@require_torch @require_torch
def test_tokenizer_as_target_length(self):
    """Targets encoded via `text_target` with `padding="max_length"` must have `max_length` columns."""
    summaries = [
        "Summary of the text.",
        "Another summary.",
    ]
    # Run the same check against both default tokenizers held by the test case.
    for tok in (self.default_tokenizer, self.default_tokenizer_fast):
        encoded = tok(text_target=summaries, max_length=32, padding="max_length", return_tensors="pt")
        self.assertEqual(32, encoded["input_ids"].shape[1])
@require_torch @require_torch
@@ -139,11 +138,9 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
"Summary of the text.", "Summary of the text.",
] ]
for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
inputs = tokenizer(src_text, return_tensors="pt") inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
targets = tokenizer(tgt_text, return_tensors="pt")
input_ids = inputs["input_ids"] input_ids = inputs["input_ids"]
labels = targets["input_ids"] labels = inputs["labels"]
self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item()) self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())
self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item()) self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())

View File

@@ -373,19 +373,15 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
@require_torch @require_torch
def test_enro_tokenizer_prepare_batch(self): def test_enro_tokenizer_prepare_batch(self):
batch = self.tokenizer( batch = self.tokenizer(
self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" self.src_text,
text_target=self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
) )
with self.tokenizer.as_target_tokenizer():
targets = self.tokenizer(
self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
)
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right( batch["decoder_input_ids"] = shift_tokens_right(
labels, self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"] batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"]
) )
self.assertIsInstance(batch, BatchEncoding) self.assertIsInstance(batch, BatchEncoding)
@@ -401,8 +397,9 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
def test_seq2seq_max_length(self): def test_seq2seq_max_length(self):
batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
with self.tokenizer.as_target_tokenizer(): targets = self.tokenizer(
targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
)
labels = targets["input_ids"] labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right( batch["decoder_input_ids"] = shift_tokens_right(
labels, labels,

View File

@@ -109,10 +109,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
src_texts = ["This is going to be way too long." * 150, "short example"] src_texts = ["This is going to be way too long." * 150, "short example"]
tgt_texts = ["not super long but more than 5 tokens", "tiny"] tgt_texts = ["not super long but more than 5 tokens", "tiny"]
batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt") batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
with self._large_tokenizer.as_target_tokenizer(): targets = self._large_tokenizer(
targets = self._large_tokenizer( text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" )
)
assert batch.input_ids.shape == (2, 1024) assert batch.input_ids.shape == (2, 1024)
assert batch.attention_mask.shape == (2, 1024) assert batch.attention_mask.shape == (2, 1024)
@@ -174,10 +173,9 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
src_texts = ["This is going to be way too long." * 1000, "short example"] src_texts = ["This is going to be way too long." * 1000, "short example"]
tgt_texts = ["not super long but more than 5 tokens", "tiny"] tgt_texts = ["not super long but more than 5 tokens", "tiny"]
batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt") batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
with self._large_tokenizer.as_target_tokenizer(): targets = self._large_tokenizer(
targets = self._large_tokenizer( text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" )
)
assert batch.input_ids.shape == (2, 4096) assert batch.input_ids.shape == (2, 4096)
assert batch.attention_mask.shape == (2, 4096) assert batch.attention_mask.shape == (2, 4096)

View File

@@ -146,10 +146,9 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"Summary of the text.", "Summary of the text.",
"Another summary.", "Another summary.",
] ]
with tokenizer.as_target_tokenizer(): targets = tokenizer(
targets = tokenizer( text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK )
)
self.assertEqual(32, targets["input_ids"].shape[1]) self.assertEqual(32, targets["input_ids"].shape[1])
# cannot use default save_and_load_tokenzier test method because tokenzier has no vocab # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab

View File

@@ -299,33 +299,26 @@ class PLBartPythonEnIntegrationTest(unittest.TestCase):
@require_torch @require_torch
def test_batch_fairseq_parity(self):
    """Encode sources and targets in one call and compare the second example's token layout with fairseq."""
    tok = self.tokenizer
    batch = tok(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
    batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tok.pad_token_id)

    # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
    self.assertEqual(batch.input_ids[1][-2:].tolist(), [2, PYTHON_CODE])
    second_decoder_input = batch.decoder_input_ids[1]
    self.assertEqual(second_decoder_input[0], EN_CODE)
    self.assertEqual(second_decoder_input[-1], 2)
    self.assertEqual(batch.labels[1][-2:].tolist(), [2, EN_CODE])
@require_torch @require_torch
def test_python_en_tokenizer_prepare_batch(self): def test_python_en_tokenizer_prepare_batch(self):
batch = self.tokenizer( batch = self.tokenizer(
self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" self.src_text,
text_target=self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
) )
with self.tokenizer.as_target_tokenizer(): batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
targets = self.tokenizer(
self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
)
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
self.assertIsInstance(batch, BatchEncoding) self.assertIsInstance(batch, BatchEncoding)
@@ -340,8 +333,9 @@ class PLBartPythonEnIntegrationTest(unittest.TestCase):
def test_seq2seq_max_length(self): def test_seq2seq_max_length(self):
batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
with self.tokenizer.as_target_tokenizer(): targets = self.tokenizer(
targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
)
labels = targets["input_ids"] labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)

View File

@@ -125,8 +125,7 @@ class Speech2TextProcessorTest(unittest.TestCase):
input_str = "This is a test string" input_str = "This is a test string"
with processor.as_target_processor(): encoded_processor = processor(text=input_str)
encoded_processor = processor(input_str)
encoded_tok = tokenizer(input_str) encoded_tok = tokenizer(input_str)

View File

@@ -210,10 +210,9 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"Summary of the text.", "Summary of the text.",
"Another summary.", "Another summary.",
] ]
with tokenizer.as_target_tokenizer(): targets = tokenizer(
targets = tokenizer( text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK )
)
self.assertEqual(32, targets["input_ids"].shape[1]) self.assertEqual(32, targets["input_ids"].shape[1])
def test_outputs_not_longer_than_maxlen(self): def test_outputs_not_longer_than_maxlen(self):
@@ -235,12 +234,10 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1] expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1]
expected_tgt_tokens = [20698, 13, 8, 1499, 5, 1] expected_tgt_tokens = [20698, 13, 8, 1499, 5, 1]
batch = tokenizer(src_text) batch = tokenizer(src_text, text_target=tgt_text)
with tokenizer.as_target_tokenizer():
targets = tokenizer(tgt_text)
self.assertEqual(expected_src_tokens, batch["input_ids"][0]) self.assertEqual(expected_src_tokens, batch["input_ids"][0])
self.assertEqual(expected_tgt_tokens, targets["input_ids"][0]) self.assertEqual(expected_tgt_tokens, batch["labels"][0])
def test_token_type_ids(self): def test_token_type_ids(self):
src_text_1 = ["A first paragraph for summarization."] src_text_1 = ["A first paragraph for summarization."]

View File

@@ -859,9 +859,8 @@ class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base") tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
answer_text = "tapex is a good model!" answer_text = "tapex is a good model!"
expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2] expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
with tokenizer.as_target_tokenizer(): answer_encoding = tokenizer(answer=answer_text)
answer_encoding = tokenizer(answer=answer_text) self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
@slow @slow
def test_tokenizer_lower_case(self): def test_tokenizer_lower_case(self):
@@ -870,23 +869,21 @@ class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
answer_text = "Beijing, London, Paris" answer_text = "Beijing, London, Paris"
answer_text_lower = "beijing, london, paris" answer_text_lower = "beijing, london, paris"
with cased_tokenizer.as_target_tokenizer(): self.assertNotEqual(
with uncased_tokenizer.as_target_tokenizer(): cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
self.assertNotEqual( )
cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids self.assertEqual(
) cased_tokenizer(answer=answer_text_lower).input_ids,
self.assertEqual( uncased_tokenizer(answer=answer_text).input_ids,
cased_tokenizer(answer=answer_text_lower).input_ids, )
uncased_tokenizer(answer=answer_text).input_ids, # batched encoding assert
) self.assertNotEqual(
# batched encoding assert cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
self.assertNotEqual( )
cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids self.assertEqual(
) cased_tokenizer(answer=[answer_text_lower]).input_ids,
self.assertEqual( uncased_tokenizer(answer=[answer_text]).input_ids,
cased_tokenizer(answer=[answer_text_lower]).input_ids, )
uncased_tokenizer(answer=[answer_text]).input_ids,
)
# test input encoding lowercase # test input encoding lowercase
question = "Greece held its last Summer Olympics in 2004" question = "Greece held its last Summer Olympics in 2004"
table_dict = { table_dict = {

View File

@@ -118,8 +118,7 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
input_str = "This is a test string" input_str = "This is a test string"
with processor.as_target_processor(): encoded_processor = processor(text=input_str)
encoded_processor = processor(input_str)
encoded_tok = tokenizer(input_str) encoded_tok = tokenizer(input_str)

View File

@@ -164,8 +164,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
input_str = "This is a test string" input_str = "This is a test string"
with processor.as_target_processor(): encoded_processor = processor(text=input_str)
encoded_processor = processor(input_str)
encoded_tok = tokenizer(input_str) encoded_tok = tokenizer(input_str)