Deprecate prepare_seq2seq_batch (#10287)
* Deprecate prepare_seq2seq_batch
* Fix last tests
* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>

* More review comments

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
This commit is contained in:
@@ -70,7 +70,7 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
def test_tokenizer_equivalence_en_de(self):
|
||||
en_de_tokenizer = MarianTokenizer.from_pretrained(f"{ORG_NAME}opus-mt-en-de")
|
||||
batch = en_de_tokenizer.prepare_seq2seq_batch(["I am a small frog"], return_tensors=None)
|
||||
batch = en_de_tokenizer(["I am a small frog"], return_tensors=None)
|
||||
self.assertIsInstance(batch, BatchEncoding)
|
||||
expected = [38, 121, 14, 697, 38848, 0]
|
||||
self.assertListEqual(expected, batch.input_ids[0])
|
||||
@@ -84,12 +84,14 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_outputs_not_longer_than_maxlen(self):
|
||||
tok = self.get_tokenizer()
|
||||
|
||||
batch = tok.prepare_seq2seq_batch(["I am a small frog" * 1000, "I am a small frog"], return_tensors=FRAMEWORK)
|
||||
batch = tok(
|
||||
["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
|
||||
)
|
||||
self.assertIsInstance(batch, BatchEncoding)
|
||||
self.assertEqual(batch.input_ids.shape, (2, 512))
|
||||
|
||||
def test_outputs_can_be_shorter(self):
|
||||
tok = self.get_tokenizer()
|
||||
batch_smaller = tok.prepare_seq2seq_batch(["I am a tiny frog", "I am a small frog"], return_tensors=FRAMEWORK)
|
||||
batch_smaller = tok(["I am a tiny frog", "I am a small frog"], padding=True, return_tensors=FRAMEWORK)
|
||||
self.assertIsInstance(batch_smaller, BatchEncoding)
|
||||
self.assertEqual(batch_smaller.input_ids.shape, (2, 10))
|
||||
|
||||
Reference in New Issue
Block a user