Replace as_target context managers by direct calls (#18325)

* Preliminary work on tokenizers

* Quality + fix tests

* Treat processors

* Fix pad

* Remove all uses of  in tests, docs and examples

* Replace all as_target_tokenizer

* Fix tests

* Fix quality

* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts <amy@huggingface.co>

* Style

Co-authored-by: amyeroberts <amy@huggingface.co>
This commit is contained in:
Sylvain Gugger
2022-07-29 08:09:09 -04:00
committed by GitHub
parent a64bcb564d
commit 986526a0e4
80 changed files with 725 additions and 550 deletions

View File

@@ -265,33 +265,27 @@ class MBartEnroIntegrationTest(unittest.TestCase):
@require_torch
def test_batch_fairseq_parity(self):
batch = self.tokenizer(self.src_text, padding=True)
with self.tokenizer.as_target_tokenizer():
targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt")
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist()
batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
# fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
assert batch.input_ids[1][-2:] == [2, EN_CODE]
assert batch.decoder_input_ids[1][0] == RO_CODE
assert batch.input_ids[1][-2:].tolist() == [2, EN_CODE]
assert batch.decoder_input_ids[1][0].tolist() == RO_CODE
assert batch.decoder_input_ids[1][-1] == 2
assert labels[1][-2:].tolist() == [2, RO_CODE]
assert batch.labels[1][-2:].tolist() == [2, RO_CODE]
@require_torch
def test_enro_tokenizer_prepare_batch(self):
batch = self.tokenizer(
self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
self.src_text,
text_target=self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
)
with self.tokenizer.as_target_tokenizer():
targets = self.tokenizer(
self.tgt_text,
padding=True,
truncation=True,
max_length=len(self.expected_src_tokens),
return_tensors="pt",
)
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
self.assertIsInstance(batch, BatchEncoding)
@@ -306,8 +300,9 @@ class MBartEnroIntegrationTest(unittest.TestCase):
def test_seq2seq_max_length(self):
batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
with self.tokenizer.as_target_tokenizer():
targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
targets = self.tokenizer(
text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
)
labels = targets["input_ids"]
batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)