[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/model_cards/SparkBeyond/roberta-large-sts-b/README.md
+++ b/model_cards/SparkBeyond/roberta-large-sts-b/README.md
@@ -40,7 +40,7 @@ def roberta_similarity_batches(to_predict):
  return similarity_scores

 def similarity_roberta(model, tokenizer, sent_pairs):
-  batch_token = tokenizer.batch_encode_plus(sent_pairs, pad_to_max_length=True, max_length=500)
+  batch_token = tokenizer(sent_pairs, padding='max_length', truncation=True, max_length=500)
  res = model(torch.tensor(batch_token['input_ids']).cuda(), attention_mask=torch.tensor(batch_token["attention_mask"]).cuda())  
  return res

--- a/model_cards/a-ware/bart-squadv2/README.md
+++ b/model_cards/a-ware/bart-squadv2/README.md
@@ -60,7 +60,7 @@ tokenizer = BartTokenizer.from_pretrained('a-ware/bart-squadv2')
 model = BartForQuestionAnswering.from_pretrained('a-ware/bart-squadv2')

 question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
+encoding = tokenizer(question, text, return_tensors='pt')
 input_ids = encoding['input_ids']
 attention_mask = encoding['attention_mask']

--- a/model_cards/a-ware/xlmroberta-squadv2/README.md
+++ b/model_cards/a-ware/xlmroberta-squadv2/README.md
@@ -43,7 +43,7 @@ tokenizer = XLMRobertaTokenizer.from_pretrained('a-ware/xlmroberta-squadv2')
 model = XLMRobertaForQuestionAnswering.from_pretrained('a-ware/xlmroberta-squadv2')

 question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
+encoding = tokenizer(question, text, return_tensors='pt')
 input_ids = encoding['input_ids']
 attention_mask = encoding['attention_mask']

--- a/model_cards/google/reformer-enwik8/README.md
+++ b/model_cards/google/reformer-enwik8/README.md
@@ -14,7 +14,7 @@ Therefore, this model does not need a tokenizer. The following function can inst
 import torch

 # Encoding
-def encode(list_of_strings, pad_to_max_length=True, pad_token_id=0):
+def encode(list_of_strings, pad_token_id=0):
    max_length = max([len(string) for string in list_of_strings])

    # create emtpy tensors
--- a/model_cards/lserinol/bert-turkish-question-answering/README.md
+++ b/model_cards/lserinol/bert-turkish-question-answering/README.md
@@ -43,7 +43,7 @@ questions = [
 ]

 for question in questions:
-    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
+    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
--- a/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md
+++ b/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md
@@ -50,7 +50,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4

 text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
 question = "What has Huggingface done ?"
-encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+encoding = tokenizer(question, text, return_tensors="pt")
 input_ids = encoding["input_ids"]

 # default is local attention everywhere
--- a/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md
+++ b/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md
@@ -55,7 +55,7 @@ model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-squadv2")

 def get_answer(question, context):
  input_text = "question: %s  context: %s </s>" % (question, context)
-  features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
+  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'])
--- a/model_cards/oliverguhr/german-sentiment-bert/README.md
+++ b/model_cards/oliverguhr/german-sentiment-bert/README.md
@@ -55,7 +55,7 @@ class SentimentModel():
    def predict_sentiment(self, texts: List[str])-> List[str]:
        texts = [self.clean_text(text) for text in texts]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
-        input_ids = self.tokenizer.batch_encode_plus(texts,pad_to_max_length=True, add_special_tokens=True)
+        input_ids = self.tokenizer(texts, padding=True, truncation=True, add_special_tokens=True)
        input_ids = torch.tensor(input_ids["input_ids"])

        with torch.no_grad():
--- a/model_cards/valhalla/bart-large-finetuned-squadv1/README.md
+++ b/model_cards/valhalla/bart-large-finetuned-squadv1/README.md
@@ -50,7 +50,7 @@ tokenizer = BartTokenizer.from_pretrained('valhalla/bart-large-finetuned-squadv1
 model = BartForQuestionAnswering.from_pretrained('valhalla/bart-large-finetuned-squadv1')

 question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
+encoding = tokenizer(question, text, return_tensors='pt')
 input_ids = encoding['input_ids']
 attention_mask = encoding['attention_mask']

--- a/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md
+++ b/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md
@@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-

 text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
 question = "What has Huggingface done ?"
-encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+encoding = tokenizer(question, text, return_tensors="pt")
 input_ids = encoding["input_ids"]

 # default is local attention everywhere
--- a/model_cards/valhalla/t5-base-squad/README.md
+++ b/model_cards/valhalla/t5-base-squad/README.md
@@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad")

 def get_answer(question, context):
  input_text = "question: %s  context: %s </s>" % (question, context)
-  features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
+  features = tokenizer([input_text], return_tensors='pt')

  out = model.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'])