[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
This commit is contained in:
Thomas Wolf
2020-06-26 19:48:14 +02:00
committed by GitHub
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions

View File

@@ -40,7 +40,7 @@ def roberta_similarity_batches(to_predict):
return similarity_scores
def similarity_roberta(model, tokenizer, sent_pairs):
batch_token = tokenizer.batch_encode_plus(sent_pairs, pad_to_max_length=True, max_length=500)
batch_token = tokenizer(sent_pairs, padding='max_length', truncation=True, max_length=500)
res = model(torch.tensor(batch_token['input_ids']).cuda(), attention_mask=torch.tensor(batch_token["attention_mask"]).cuda())
return res

View File

@@ -60,7 +60,7 @@ tokenizer = BartTokenizer.from_pretrained('a-ware/bart-squadv2')
model = BartForQuestionAnswering.from_pretrained('a-ware/bart-squadv2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
encoding = tokenizer(question, text, return_tensors='pt')
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

View File

@@ -43,7 +43,7 @@ tokenizer = XLMRobertaTokenizer.from_pretrained('a-ware/xlmroberta-squadv2')
model = XLMRobertaForQuestionAnswering.from_pretrained('a-ware/xlmroberta-squadv2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
encoding = tokenizer(question, text, return_tensors='pt')
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

View File

@@ -14,7 +14,7 @@ Therefore, this model does not need a tokenizer. The following function can inst
import torch
# Encoding
def encode(list_of_strings, pad_to_max_length=True, pad_token_id=0):
def encode(list_of_strings, pad_token_id=0):
max_length = max([len(string) for string in list_of_strings])
# create empty tensors

View File

@@ -43,7 +43,7 @@ questions = [
]
for question in questions:
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)

View File

@@ -50,7 +50,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4
text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
question = "What has Huggingface done ?"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]
# default is local attention everywhere

View File

@@ -55,7 +55,7 @@ model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-squadv2")
def get_answer(question, context):
input_text = "question: %s context: %s </s>" % (question, context)
features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
features = tokenizer([input_text], return_tensors='pt')
output = model.generate(input_ids=features['input_ids'],
attention_mask=features['attention_mask'])

View File

@@ -55,7 +55,7 @@ class SentimentModel():
def predict_sentiment(self, texts: List[str])-> List[str]:
texts = [self.clean_text(text) for text in texts]
# add_special_tokens=True takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
input_ids = self.tokenizer.batch_encode_plus(texts,pad_to_max_length=True, add_special_tokens=True)
input_ids = self.tokenizer(texts, padding=True, truncation=True, add_special_tokens=True)
input_ids = torch.tensor(input_ids["input_ids"])
with torch.no_grad():

View File

@@ -50,7 +50,7 @@ tokenizer = BartTokenizer.from_pretrained('valhalla/bart-large-finetuned-squadv1
model = BartForQuestionAnswering.from_pretrained('valhalla/bart-large-finetuned-squadv1')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
encoding = tokenizer(question, text, return_tensors='pt')
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

View File

@@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-
text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
question = "What has Huggingface done ?"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]
# default is local attention everywhere

View File

@@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad")
def get_answer(question, context):
input_text = "question: %s context: %s </s>" % (question, context)
features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
features = tokenizer([input_text], return_tensors='pt')
out = model.generate(input_ids=features['input_ids'],
attention_mask=features['attention_mask'])