Tokenizers should be framework agnostic (#8599)

* Tokenizers should be framework agnostic * Run the slow tests * Not testing * Fix documentation * Apply suggestions from code review Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-17 14:03:03 -05:00
parent 7f3b41a306
commit 3095ee9dab
28 changed files with 73 additions and 177 deletions
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -44,7 +44,7 @@ the sequences for sequence-to-sequence fine-tuning.

    example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian)
+    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
    model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

 - Generation
@@ -58,7 +58,7 @@ the sequences for sequence-to-sequence fine-tuning.
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    article = "UN Chief Says There Is No Military Solution in Syria"
-    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX")
+    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
    translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"