Merge pull request #1601 from huggingface/clean-roberta

Clean roberta model & all tokenizers now add special tokens by default (breaking change)
This commit is contained in:
Thomas Wolf
2019-10-30 17:00:40 +01:00
committed by GitHub
12 changed files with 41 additions and 67 deletions

View File

@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
model = AutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_sequence = tokenizer.encode(input_text)
tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
max_input_size = tokenizer.max_model_input_sizes[model_name]
batch_sizes = [1, 2, 4, 8]
@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
model = TFAutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_sequence = tokenizer.encode(input_text)
tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
max_input_size = tokenizer.max_model_input_sizes[model_name]
batch_sizes = [1, 2, 4, 8]

View File

@@ -68,7 +68,7 @@ def main():
start = time.time()
for text in data:
text = f'{bos} {text.strip()} {sep}'
token_ids = tokenizer.encode(text)
token_ids = tokenizer.encode(text, add_special_tokens=False)
rslt.append(token_ids)
iter += 1

View File

@@ -223,7 +223,7 @@ def main():
if args.model_type in ["transfo-xl", "xlnet"]:
# Models with memory likes to have a long prompt for short inputs.
raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
context_tokens = tokenizer.encode(raw_text)
context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
if args.model_type == "ctrl":
if not any(context_tokens[0] == x for x in tokenizer.control_codes.values()):
logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")