[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
This commit is contained in:
Thomas Wolf
2020-06-26 19:48:14 +02:00
committed by GitHub
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions

View File

@@ -45,9 +45,9 @@ def generate_summaries_or_translations(
for batch in tqdm(list(chunks(examples, batch_size))):
if "t5" in model_name:
batch = [model.config.prefix + text for text in batch]
batch = tokenizer.batch_encode_plus(
batch, max_length=1024, return_tensors="pt", truncation=True, pad_to_max_length=True
).to(device)
batch = tokenizer(batch, max_length=1024, return_tensors="pt", truncation=True, padding="max_length").to(
device
)
summaries = model.generate(**batch, **gen_kwargs)
dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for hypothesis in dec: