[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)
* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
This commit is contained in:
@@ -40,7 +40,7 @@ def roberta_similarity_batches(to_predict):
|
||||
return similarity_scores
|
||||
|
||||
def similarity_roberta(model, tokenizer, sent_pairs):
|
||||
batch_token = tokenizer.batch_encode_plus(sent_pairs, pad_to_max_length=True, max_length=500)
|
||||
batch_token = tokenizer(sent_pairs, padding='max_length', truncation=True, max_length=500)
|
||||
res = model(torch.tensor(batch_token['input_ids']).cuda(), attention_mask=torch.tensor(batch_token["attention_mask"]).cuda())
|
||||
return res
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ tokenizer = BartTokenizer.from_pretrained('a-ware/bart-squadv2')
|
||||
model = BartForQuestionAnswering.from_pretrained('a-ware/bart-squadv2')
|
||||
|
||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
|
||||
encoding = tokenizer(question, text, return_tensors='pt')
|
||||
input_ids = encoding['input_ids']
|
||||
attention_mask = encoding['attention_mask']
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ tokenizer = XLMRobertaTokenizer.from_pretrained('a-ware/xlmroberta-squadv2')
|
||||
model = XLMRobertaForQuestionAnswering.from_pretrained('a-ware/xlmroberta-squadv2')
|
||||
|
||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
|
||||
encoding = tokenizer(question, text, return_tensors='pt')
|
||||
input_ids = encoding['input_ids']
|
||||
attention_mask = encoding['attention_mask']
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ Therefore, this model does not need a tokenizer. The following function can inst
|
||||
import torch
|
||||
|
||||
# Encoding
|
||||
def encode(list_of_strings, pad_to_max_length=True, pad_token_id=0):
|
||||
def encode(list_of_strings, pad_token_id=0):
|
||||
max_length = max([len(string) for string in list_of_strings])
|
||||
|
||||
# create empty tensors
|
||||
|
||||
@@ -43,7 +43,7 @@ questions = [
|
||||
]
|
||||
|
||||
for question in questions:
|
||||
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
|
||||
inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
|
||||
input_ids = inputs["input_ids"].tolist()[0]
|
||||
|
||||
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
|
||||
@@ -50,7 +50,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4
|
||||
|
||||
text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
|
||||
question = "What has Huggingface done ?"
|
||||
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
|
||||
encoding = tokenizer(question, text, return_tensors="pt")
|
||||
input_ids = encoding["input_ids"]
|
||||
|
||||
# default is local attention everywhere
|
||||
|
||||
@@ -55,7 +55,7 @@ model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-squadv2")
|
||||
|
||||
def get_answer(question, context):
|
||||
input_text = "question: %s context: %s </s>" % (question, context)
|
||||
features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
|
||||
features = tokenizer([input_text], return_tensors='pt')
|
||||
|
||||
output = model.generate(input_ids=features['input_ids'],
|
||||
attention_mask=features['attention_mask'])
|
||||
|
||||
@@ -55,7 +55,7 @@ class SentimentModel():
|
||||
def predict_sentiment(self, texts: List[str])-> List[str]:
|
||||
texts = [self.clean_text(text) for text in texts]
|
||||
# Passing add_special_tokens=True takes care of adding the [CLS], [SEP], <s>... tokens in the right way for each model.
|
||||
input_ids = self.tokenizer.batch_encode_plus(texts,pad_to_max_length=True, add_special_tokens=True)
|
||||
input_ids = self.tokenizer(texts, padding=True, truncation=True, add_special_tokens=True)
|
||||
input_ids = torch.tensor(input_ids["input_ids"])
|
||||
|
||||
with torch.no_grad():
|
||||
|
||||
@@ -50,7 +50,7 @@ tokenizer = BartTokenizer.from_pretrained('valhalla/bart-large-finetuned-squadv1
|
||||
model = BartForQuestionAnswering.from_pretrained('valhalla/bart-large-finetuned-squadv1')
|
||||
|
||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
|
||||
encoding = tokenizer(question, text, return_tensors='pt')
|
||||
input_ids = encoding['input_ids']
|
||||
attention_mask = encoding['attention_mask']
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-
|
||||
|
||||
text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
|
||||
question = "What has Huggingface done ?"
|
||||
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
|
||||
encoding = tokenizer(question, text, return_tensors="pt")
|
||||
input_ids = encoding["input_ids"]
|
||||
|
||||
# default is local attention everywhere
|
||||
|
||||
@@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad")
|
||||
|
||||
def get_answer(question, context):
|
||||
input_text = "question: %s context: %s </s>" % (question, context)
|
||||
features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
|
||||
features = tokenizer([input_text], return_tensors='pt')
|
||||
|
||||
out = model.generate(input_ids=features['input_ids'],
|
||||
attention_mask=features['attention_mask'])
|
||||
|
||||
Reference in New Issue
Block a user