rename prepare_translation_batch -> prepare_seq2seq_batch (#6103)

This commit is contained in:
Sam Shleifer
2020-08-11 15:57:07 -04:00
committed by GitHub
parent 66fa8ceaea
commit be1520d3a3
14 changed files with 208 additions and 123 deletions

View File

@@ -97,7 +97,7 @@ class MarianIntegrationTest(unittest.TestCase):
self.assertListEqual(self.expected_text, generated_words)
def translate_src_text(self, **tokenizer_kwargs):
model_inputs = self.tokenizer.prepare_translation_batch(src_texts=self.src_text, **tokenizer_kwargs).to(
model_inputs = self.tokenizer.prepare_seq2seq_batch(src_texts=self.src_text, **tokenizer_kwargs).to(
torch_device
)
self.assertEqual(self.model.device, model_inputs.input_ids.device)
@@ -114,7 +114,7 @@ class TestMarian_EN_DE_More(MarianIntegrationTest):
src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
expected_ids = [38, 121, 14, 697, 38848, 0]
model_inputs: dict = self.tokenizer.prepare_translation_batch(src, tgt_texts=tgt).to(torch_device)
model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt).to(torch_device)
self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())
desired_keys = {
@@ -131,12 +131,12 @@ class TestMarian_EN_DE_More(MarianIntegrationTest):
def test_unk_support(self):
t = self.tokenizer
ids = t.prepare_translation_batch(["||"]).to(torch_device).input_ids[0].tolist()
ids = t.prepare_seq2seq_batch(["||"]).to(torch_device).input_ids[0].tolist()
expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id]
self.assertEqual(expected, ids)
def test_pad_not_split(self):
input_ids_w_pad = self.tokenizer.prepare_translation_batch(["I am a small frog <pad>"]).input_ids[0].tolist()
input_ids_w_pad = self.tokenizer.prepare_seq2seq_batch(["I am a small frog <pad>"]).input_ids[0].tolist()
expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad
self.assertListEqual(expected_w_pad, input_ids_w_pad)
@@ -229,7 +229,7 @@ class TestMarian_en_ROMANCE(MarianIntegrationTest):
normalized = self.tokenizer.normalize("")
self.assertIsInstance(normalized, str)
with self.assertRaises(ValueError):
self.tokenizer.prepare_translation_batch([""])
self.tokenizer.prepare_seq2seq_batch([""])
def test_pipeline(self):
device = 0 if torch_device == "cuda" else -1