Reformat source code with black.
This is the result of:
$ black --line-length 119 examples templates transformers utils hubconf.py setup.py
There's a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.
This is also Thomas' preference, because it allows for explicit variable
names, to make the code easier to understand.
This commit is contained in:
@@ -10,38 +10,37 @@ from transformers.modeling_camembert import CamembertForMaskedLM
|
||||
|
||||
def fill_mask(masked_input, model, tokenizer, topk=5):
|
||||
# Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
|
||||
assert masked_input.count('<mask>') == 1
|
||||
assert masked_input.count("<mask>") == 1
|
||||
input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple
|
||||
masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
|
||||
logits = logits[0, masked_index, :]
|
||||
prob = logits.softmax(dim=0)
|
||||
values, indices = prob.topk(k=topk, dim=0)
|
||||
topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item())
|
||||
for i in range(len(indices))])
|
||||
topk_predicted_token_bpe = " ".join(
|
||||
[tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
|
||||
)
|
||||
masked_token = tokenizer.mask_token
|
||||
topk_filled_outputs = []
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
|
||||
predicted_token = predicted_token_bpe.replace('\u2581', ' ')
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
|
||||
predicted_token = predicted_token_bpe.replace("\u2581", " ")
|
||||
if " {0}".format(masked_token) in masked_input:
|
||||
topk_filled_outputs.append((
|
||||
masked_input.replace(
|
||||
' {0}'.format(masked_token), predicted_token
|
||||
),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
))
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(" {0}".format(masked_token), predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
)
|
||||
else:
|
||||
topk_filled_outputs.append((
|
||||
masked_input.replace(masked_token, predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
))
|
||||
topk_filled_outputs.append(
|
||||
(masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
|
||||
)
|
||||
return topk_filled_outputs
|
||||
|
||||
|
||||
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||
model = CamembertForMaskedLM.from_pretrained('camembert-base')
|
||||
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
|
||||
model = CamembertForMaskedLM.from_pretrained("camembert-base")
|
||||
model.eval()
|
||||
|
||||
masked_input = "Le camembert est <mask> :)"
|
||||
|
||||
Reference in New Issue
Block a user