From d6fc34b45984ead95fb59c3b0758347fcb95603c Mon Sep 17 00:00:00 2001
From: Julien Chaumond
Date: Fri, 31 Jan 2020 17:10:04 -0500
Subject: [PATCH] [model_cards] add mine

---
 .../julien-c/bert-xsmall-dummy/README.md     | 25 +++++++++
 model_cards/julien-c/dummy-unknown/README.md | 52 +++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 model_cards/julien-c/bert-xsmall-dummy/README.md
 create mode 100644 model_cards/julien-c/dummy-unknown/README.md

diff --git a/model_cards/julien-c/bert-xsmall-dummy/README.md b/model_cards/julien-c/bert-xsmall-dummy/README.md
new file mode 100644
index 0000000000..36eef62327
--- /dev/null
+++ b/model_cards/julien-c/bert-xsmall-dummy/README.md
@@ -0,0 +1,25 @@
+## How to build a dummy model
+
+
+```python
+from transformers.configuration_bert import BertConfig
+from transformers.modeling_bert import BertForMaskedLM
+from transformers.modeling_tf_bert import TFBertForMaskedLM
+from transformers.tokenization_bert import BertTokenizer
+
+
+SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
+DIRNAME = "./bert-xsmall-dummy"
+
+config = BertConfig(10, 20, 1, 1, 40)
+
+model = BertForMaskedLM(config)
+model.save_pretrained(DIRNAME)
+
+tf_model = TFBertForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
+tf_model.save_pretrained(DIRNAME)
+
+# Slightly different for tokenizer.
+# tokenizer = BertTokenizer.from_pretrained(DIRNAME)
+# tokenizer.save_pretrained()
+```
diff --git a/model_cards/julien-c/dummy-unknown/README.md b/model_cards/julien-c/dummy-unknown/README.md
new file mode 100644
index 0000000000..9cdc3d2437
--- /dev/null
+++ b/model_cards/julien-c/dummy-unknown/README.md
@@ -0,0 +1,52 @@
+
+```python
+import json
+import os
+from transformers.configuration_roberta import RobertaConfig
+from transformers import RobertaForMaskedLM, TFRobertaForMaskedLM
+
+DIRNAME = "./dummy-unknown"
+
+
+config = RobertaConfig(10, 20, 1, 1, 40)
+
+model = RobertaForMaskedLM(config)
+model.save_pretrained(DIRNAME)
+
+tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
+tf_model.save_pretrained(DIRNAME)
+
+# Tokenizer:
+
+vocab = [
+    "l",
+    "o",
+    "w",
+    "e",
+    "r",
+    "s",
+    "t",
+    "i",
+    "d",
+    "n",
+    "\u0120",
+    "\u0120l",
+    "\u0120n",
+    "\u0120lo",
+    "\u0120low",
+    "er",
+    "\u0120lowest",
+    "\u0120newer",
+    "\u0120wider",
+    "<unk>",
+]
+vocab_tokens = dict(zip(vocab, range(len(vocab))))
+merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
+
+vocab_file = os.path.join(DIRNAME, "vocab.json")
+merges_file = os.path.join(DIRNAME, "merges.txt")
+with open(vocab_file, "w", encoding="utf-8") as fp:
+    fp.write(json.dumps(vocab_tokens) + "\n")
+with open(merges_file, "w", encoding="utf-8") as fp:
+    fp.write("\n".join(merges))
+```