From ab1238393caba8e5ab9eafc50fe6b6defe15bbb1 Mon Sep 17 00:00:00 2001 From: Ilias Chalkidis Date: Wed, 19 Feb 2020 13:26:16 +0200 Subject: [PATCH] Update to include example of LM The model files have been updated in order to include the classification layers, based on https://github.com/huggingface/transformers/issues/2901, and can now also be used as an LM. --- .../bert-base-greek-uncased-v1/README.md | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/model_cards/nlpaueb/bert-base-greek-uncased-v1/README.md b/model_cards/nlpaueb/bert-base-greek-uncased-v1/README.md index 14c5ccece8..05d69d58e3 100644 --- a/model_cards/nlpaueb/bert-base-greek-uncased-v1/README.md +++ b/model_cards/nlpaueb/bert-base-greek-uncased-v1/README.md @@ -68,6 +68,47 @@ tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1") model = AutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1") ``` +## Use Pretrained Model as a Language Model + +```python +import torch +from transformers import * + +# Load model and tokenizer +tokenizer_greek = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1') +lm_model_greek = AutoModelWithLMHead.from_pretrained('nlpaueb/bert-base-greek-uncased-v1') + +# ================ EXAMPLE 1 ================ +text_1 = 'O ποιητής έγραψε ένα [MASK] .' +# EN: 'The poet wrote a [MASK].' +input_ids = tokenizer_greek.encode(text_1) +print(tokenizer_greek.convert_ids_to_tokens(input_ids)) +# ['[CLS]', 'o', 'ποιητης', 'εγραψε', 'ενα', '[MASK]', '.', '[SEP]'] +outputs = lm_model_greek(torch.tensor([input_ids]))[0] +print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 5].max(0)[1].item())) +# the most plausible prediction for [MASK] is "song" + +# ================ EXAMPLE 2 ================ +text_2 = 'Είναι ένας [MASK] άνθρωπος.' +# EN: 'He is a [MASK] person.' 
+input_ids = tokenizer_greek.encode(text_2) +print(tokenizer_greek.convert_ids_to_tokens(input_ids)) +# ['[CLS]', 'ειναι', 'ενας', '[MASK]', 'ανθρωπος', '.', '[SEP]'] +outputs = lm_model_greek(torch.tensor([input_ids]))[0] +print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 3].max(0)[1].item())) +# the most plausible prediction for [MASK] is "good" + +# ================ EXAMPLE 3 ================ +text_3 = 'Είναι ένας [MASK] άνθρωπος και κάνει συχνά [MASK].' +# EN: 'He is a [MASK] person and he frequently does [MASK].' +input_ids = tokenizer_greek.encode(text_3) +print(tokenizer_greek.convert_ids_to_tokens(input_ids)) +# ['[CLS]', 'ειναι', 'ενας', '[MASK]', 'ανθρωπος', 'και', 'κανει', 'συχνα', '[MASK]', '.', '[SEP]'] +outputs = lm_model_greek(torch.tensor([input_ids]))[0] +print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 8].max(0)[1].item())) +# the most plausible prediction for the second [MASK] is "trips" +``` + ## Evaluation on downstream tasks TBA