Tokenizers should be framework agnostic (#8599)

* Tokenizers should be framework agnostic

* Run the slow tests

* Not testing

* Fix documentation

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
Lysandre Debut
2020-11-17 14:03:03 -05:00
committed by GitHub
parent 7f3b41a306
commit 3095ee9dab
28 changed files with 73 additions and 177 deletions

View File

@@ -61,7 +61,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_pegasus_large_seq2seq_truncation(self):
src_texts = ["This is going to be way too long." * 150, "short example"]
tgt_texts = ["not super long but more than 5 tokens", "tiny"]
batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, max_target_length=5)
batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
)
assert batch.input_ids.shape == (2, 1024)
assert batch.attention_mask.shape == (2, 1024)
assert "labels" in batch # because tgt_texts was specified