From d7b3bf547c9d488192cb4a7ac394907a23dc8bae Mon Sep 17 00:00:00 2001 From: monologg Date: Mon, 27 Apr 2020 02:07:26 +0900 Subject: [PATCH] Model cards for KoELECTRA --- .../koelectra-base-discriminator/README.md | 52 +++++++++++++++++++ .../koelectra-base-generator/README.md | 45 ++++++++++++++++ .../koelectra-small-discriminator/README.md | 52 +++++++++++++++++++ .../koelectra-small-generator/README.md | 45 ++++++++++++++++ 4 files changed, 194 insertions(+) create mode 100644 model_cards/monologg/koelectra-base-discriminator/README.md create mode 100644 model_cards/monologg/koelectra-base-generator/README.md create mode 100644 model_cards/monologg/koelectra-small-discriminator/README.md create mode 100644 model_cards/monologg/koelectra-small-generator/README.md diff --git a/model_cards/monologg/koelectra-base-discriminator/README.md b/model_cards/monologg/koelectra-base-discriminator/README.md new file mode 100644 index 0000000000..7bfe13e469 --- /dev/null +++ b/model_cards/monologg/koelectra-base-discriminator/README.md @@ -0,0 +1,52 @@ +--- +language: Korean +--- + +# KoELECTRA (Base Discriminator) + +Pretrained ELECTRA Language Model for Korean (`koelectra-base-discriminator`) + +For more details, please see the [original repository](https://github.com/monologg/KoELECTRA/blob/master/README_EN.md). + +## Usage + +### Load model and tokenizer + +```python +>>> from transformers import ElectraModel, ElectraTokenizer + +>>> model = ElectraModel.from_pretrained("monologg/koelectra-base-discriminator") +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator") +``` + +### Tokenizer example + +```python +>>> from transformers import ElectraTokenizer +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator") +>>> tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. 
[SEP]") +['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]']) +[2, 18429, 41, 6240, 15229, 6204, 20894, 5689, 12622, 10690, 18, 3] +``` + +## Example using ElectraForPreTraining + +```python +import torch +from transformers import ElectraForPreTraining, ElectraTokenizer + +discriminator = ElectraForPreTraining.from_pretrained("monologg/koelectra-base-discriminator") +tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator") + +sentence = "나는 방금 밥을 먹었다." +fake_sentence = "나는 내일 밥을 먹었다." + +fake_tokens = tokenizer.tokenize(fake_sentence) +fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") + +discriminator_outputs = discriminator(fake_inputs) +predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + +print(list(zip(fake_tokens, predictions.tolist()[1:-1]))) +``` diff --git a/model_cards/monologg/koelectra-base-generator/README.md b/model_cards/monologg/koelectra-base-generator/README.md new file mode 100644 index 0000000000..08c0e74714 --- /dev/null +++ b/model_cards/monologg/koelectra-base-generator/README.md @@ -0,0 +1,45 @@ +--- +language: Korean +--- + +# KoELECTRA (Base Generator) + +Pretrained ELECTRA Language Model for Korean (`koelectra-base-generator`) + +For more details, please see the [original repository](https://github.com/monologg/KoELECTRA/blob/master/README_EN.md). 
+ +## Usage + +### Load model and tokenizer + +```python +>>> from transformers import ElectraModel, ElectraTokenizer + +>>> model = ElectraModel.from_pretrained("monologg/koelectra-base-generator") +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-generator") +``` + +### Tokenizer example + +```python +>>> from transformers import ElectraTokenizer +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-generator") +>>> tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]") +['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]']) +[2, 18429, 41, 6240, 15229, 6204, 20894, 5689, 12622, 10690, 18, 3] +``` + +## Example using ElectraForMaskedLM + +```python +from transformers import pipeline + +fill_mask = pipeline( + "fill-mask", + model="monologg/koelectra-base-generator", + tokenizer="monologg/koelectra-base-generator" +) + +print(fill_mask("나는 {} 밥을 먹었다.".format(fill_mask.tokenizer.mask_token))) +``` diff --git a/model_cards/monologg/koelectra-small-discriminator/README.md b/model_cards/monologg/koelectra-small-discriminator/README.md new file mode 100644 index 0000000000..950736209e --- /dev/null +++ b/model_cards/monologg/koelectra-small-discriminator/README.md @@ -0,0 +1,52 @@ +--- +language: Korean +--- + +# KoELECTRA (Small Discriminator) + +Pretrained ELECTRA Language Model for Korean (`koelectra-small-discriminator`) + +For more details, please see the [original repository](https://github.com/monologg/KoELECTRA/blob/master/README_EN.md). 
+ +## Usage + +### Load model and tokenizer + +```python +>>> from transformers import ElectraModel, ElectraTokenizer + +>>> model = ElectraModel.from_pretrained("monologg/koelectra-small-discriminator") +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator") +``` + +### Tokenizer example + +```python +>>> from transformers import ElectraTokenizer +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator") +>>> tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]") +['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]']) +[2, 18429, 41, 6240, 15229, 6204, 20894, 5689, 12622, 10690, 18, 3] +``` + +## Example using ElectraForPreTraining + +```python +import torch +from transformers import ElectraForPreTraining, ElectraTokenizer + +discriminator = ElectraForPreTraining.from_pretrained("monologg/koelectra-small-discriminator") +tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator") + +sentence = "나는 방금 밥을 먹었다." +fake_sentence = "나는 내일 밥을 먹었다." 
+ +fake_tokens = tokenizer.tokenize(fake_sentence) +fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") + +discriminator_outputs = discriminator(fake_inputs) +predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + +print(list(zip(fake_tokens, predictions.tolist()[1:-1]))) +``` diff --git a/model_cards/monologg/koelectra-small-generator/README.md b/model_cards/monologg/koelectra-small-generator/README.md new file mode 100644 index 0000000000..5fd7bb7acc --- /dev/null +++ b/model_cards/monologg/koelectra-small-generator/README.md @@ -0,0 +1,45 @@ +--- +language: Korean +--- + +# KoELECTRA (Small Generator) + +Pretrained ELECTRA Language Model for Korean (`koelectra-small-generator`) + +For more details, please see the [original repository](https://github.com/monologg/KoELECTRA/blob/master/README_EN.md). + +## Usage + +### Load model and tokenizer + +```python +>>> from transformers import ElectraModel, ElectraTokenizer + +>>> model = ElectraModel.from_pretrained("monologg/koelectra-small-generator") +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-generator") +``` + +### Tokenizer example + +```python +>>> from transformers import ElectraTokenizer +>>> tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-generator") +>>> tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]") +['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', '한국어', 'E', '##L', '##EC', '##T', '##RA', '##를', '공유', '##합니다', '.', '[SEP]']) +[2, 18429, 41, 6240, 15229, 6204, 20894, 5689, 12622, 10690, 18, 3] +``` + +## Example using ElectraForMaskedLM + +```python +from transformers import pipeline + +fill_mask = pipeline( + "fill-mask", + model="monologg/koelectra-small-generator", + tokenizer="monologg/koelectra-small-generator" +) + +print(fill_mask("나는 {} 밥을 먹었다.".format(fill_mask.tokenizer.mask_token))) +```