Tokenizers should be framework agnostic (#8599)
* Tokenizers should be framework agnostic
* Run the slow tests
* Not testing
* Fix documentation
* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -61,7 +61,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_pegasus_large_seq2seq_truncation(self):
|
||||
src_texts = ["This is going to be way too long." * 150, "short example"]
|
||||
tgt_texts = ["not super long but more than 5 tokens", "tiny"]
|
||||
batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, max_target_length=5)
|
||||
batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
|
||||
src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
|
||||
)
|
||||
assert batch.input_ids.shape == (2, 1024)
|
||||
assert batch.attention_mask.shape == (2, 1024)
|
||||
assert "labels" in batch # because tgt_texts was specified
|
||||
|
||||
Reference in New Issue
Block a user