From 1c5cd8e5f59154905c5ae0f47a8c8905618a12ff Mon Sep 17 00:00:00 2001
From: Lee Haau-Sing <55363337+leehaausing@users.noreply.github.com>
Date: Mon, 22 Jun 2020 17:24:27 -0400
Subject: [PATCH] Add README.md (nyu-mll) (#5174)

* nyu-mll: roberta on smaller datasets

* Update README.md

* Update README.md

Co-authored-by: Alex Warstadt <alexwarstadt@gmail.com>
---
 .../nyu-mll/roberta-base-100M-1/README.md     |  1 +
 .../nyu-mll/roberta-base-100M-2/README.md     |  1 +
 .../nyu-mll/roberta-base-100M-3/README.md     |  1 +
 .../nyu-mll/roberta-base-10M-1/README.md      |  1 +
 .../nyu-mll/roberta-base-10M-2/README.md      |  1 +
 .../nyu-mll/roberta-base-10M-3/README.md      |  1 +
 .../nyu-mll/roberta-base-1B-1/README.md       |  1 +
 .../nyu-mll/roberta-base-1B-2/README.md       |  1 +
 .../nyu-mll/roberta-base-1B-3/README.md       |  1 +
 .../nyu-mll/roberta-med-small-1M-1/README.md  |  1 +
 .../nyu-mll/roberta-med-small-1M-2/README.md  |  1 +
 .../nyu-mll/roberta-med-small-1M-3/README.md  |  1 +
 .../nyu-mll/roberta_1M_to_1B/README.md        | 49 +++++++++++++++++++
 13 files changed, 61 insertions(+)
 create mode 120000 model_cards/nyu-mll/roberta-base-100M-1/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-100M-2/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-100M-3/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-10M-1/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-10M-2/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-10M-3/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-1B-1/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-1B-2/README.md
 create mode 120000 model_cards/nyu-mll/roberta-base-1B-3/README.md
 create mode 120000 model_cards/nyu-mll/roberta-med-small-1M-1/README.md
 create mode 120000 model_cards/nyu-mll/roberta-med-small-1M-2/README.md
 create mode 120000 model_cards/nyu-mll/roberta-med-small-1M-3/README.md
 create mode 100644 model_cards/nyu-mll/roberta_1M_to_1B/README.md

diff --git a/model_cards/nyu-mll/roberta-base-100M-1/README.md b/model_cards/nyu-mll/roberta-base-100M-1/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-100M-1/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-100M-2/README.md b/model_cards/nyu-mll/roberta-base-100M-2/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-100M-2/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-100M-3/README.md b/model_cards/nyu-mll/roberta-base-100M-3/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-100M-3/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-10M-1/README.md b/model_cards/nyu-mll/roberta-base-10M-1/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-10M-1/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-10M-2/README.md b/model_cards/nyu-mll/roberta-base-10M-2/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-10M-2/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-10M-3/README.md b/model_cards/nyu-mll/roberta-base-10M-3/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-10M-3/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-1B-1/README.md b/model_cards/nyu-mll/roberta-base-1B-1/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-1B-1/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-1B-2/README.md b/model_cards/nyu-mll/roberta-base-1B-2/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-1B-2/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-base-1B-3/README.md b/model_cards/nyu-mll/roberta-base-1B-3/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-base-1B-3/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-med-small-1M-1/README.md b/model_cards/nyu-mll/roberta-med-small-1M-1/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-med-small-1M-1/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-med-small-1M-2/README.md b/model_cards/nyu-mll/roberta-med-small-1M-2/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-med-small-1M-2/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta-med-small-1M-3/README.md b/model_cards/nyu-mll/roberta-med-small-1M-3/README.md
new file mode 120000
index 0000000000..34a367f775
--- /dev/null
+++ b/model_cards/nyu-mll/roberta-med-small-1M-3/README.md
@@ -0,0 +1 @@
+../roberta_1M_to_1B/README.md
\ No newline at end of file
diff --git a/model_cards/nyu-mll/roberta_1M_to_1B/README.md b/model_cards/nyu-mll/roberta_1M_to_1B/README.md
new file mode 100644
index 0000000000..1689faf800
--- /dev/null
+++ b/model_cards/nyu-mll/roberta_1M_to_1B/README.md
@@ -0,0 +1,49 @@
+# RoBERTa Pretrained on Smaller Datasets
+
+We pretrain RoBERTa on smaller datasets (1M, 10M, 100M, and 1B tokens). For each pretraining data size, we release the 3 models with the lowest perplexities out of 25 runs (or 10 runs in the case of 1B tokens). The pretraining data reproduces that of BERT: we combine English Wikipedia and a reproduction of BookCorpus built from Smashwords texts, in a ratio of approximately 3:1.
+
+### Hyperparameters and Validation Perplexity
+
+The hyperparameters and validation perplexities corresponding to each model are as follows:
+
+| Model Name               | Training Size | Model Size | Max Steps | Batch Size | Validation Perplexity |
+|--------------------------|---------------|------------|-----------|------------|-----------------------|
+| [roberta-base-1B-1][link-roberta-base-1B-1]        | 1B            | BASE       | 100K      | 512        | 3.93                  |
+| [roberta-base-1B-2][link-roberta-base-1B-2]        | 1B            | BASE       | 31K       | 1024       | 4.25                  |
+| [roberta-base-1B-3][link-roberta-base-1B-3]        | 1B            | BASE       | 31K       | 4096       | 3.84                  |
+| [roberta-base-100M-1][link-roberta-base-100M-1]      | 100M          | BASE       | 100K      | 512        | 4.99                  |
+| [roberta-base-100M-2][link-roberta-base-100M-2]      | 100M          | BASE       | 31K       | 1024       | 4.61                  |
+| [roberta-base-100M-3][link-roberta-base-100M-3]      | 100M          | BASE       | 31K       | 512        | 5.02                  |
+| [roberta-base-10M-1][link-roberta-base-10M-1]       | 10M           | BASE       | 10K       | 1024       | 11.31                 |
+| [roberta-base-10M-2][link-roberta-base-10M-2]       | 10M           | BASE       | 10K       | 512        | 10.78                 |
+| [roberta-base-10M-3][link-roberta-base-10M-3]       | 10M           | BASE       | 31K       | 512        | 11.58                 |
+| [roberta-med-small-1M-1][link-roberta-med-small-1M-1]   | 1M            | MED-SMALL  | 100K      | 512        | 153.38                |
+| [roberta-med-small-1M-2][link-roberta-med-small-1M-2]   | 1M            | MED-SMALL  | 10K       | 512        | 134.18                |
+| [roberta-med-small-1M-3][link-roberta-med-small-1M-3]   | 1M            | MED-SMALL  | 31K       | 512        | 139.39                |
+
+The hyperparameters corresponding to model sizes mentioned above are as follows:
+
+| Model Size | L  | AH | HS  | FFN  | P    |
+|------------|----|----|-----|------|------|
+| BASE       | 12 | 12 | 768 | 3072 | 125M |
+| MED-SMALL  | 6  | 8  | 512 | 2048 | 45M  |
+
+(L = number of layers; AH = number of attention heads; HS = hidden size; FFN = feedforward network dimension; P = number of parameters.)
+
+For other hyperparameters, we select:
+- Peak Learning Rate: 5e-4
+- Warmup Steps: 6% of max steps
+- Dropout: 0.1
+
+[link-roberta-med-small-1M-1]: https://huggingface.co/nyu-mll/roberta-med-small-1M-1
+[link-roberta-med-small-1M-2]: https://huggingface.co/nyu-mll/roberta-med-small-1M-2
+[link-roberta-med-small-1M-3]: https://huggingface.co/nyu-mll/roberta-med-small-1M-3
+[link-roberta-base-10M-1]: https://huggingface.co/nyu-mll/roberta-base-10M-1
+[link-roberta-base-10M-2]: https://huggingface.co/nyu-mll/roberta-base-10M-2
+[link-roberta-base-10M-3]: https://huggingface.co/nyu-mll/roberta-base-10M-3
+[link-roberta-base-100M-1]: https://huggingface.co/nyu-mll/roberta-base-100M-1
+[link-roberta-base-100M-2]: https://huggingface.co/nyu-mll/roberta-base-100M-2
+[link-roberta-base-100M-3]: https://huggingface.co/nyu-mll/roberta-base-100M-3
+[link-roberta-base-1B-1]: https://huggingface.co/nyu-mll/roberta-base-1B-1
+[link-roberta-base-1B-2]: https://huggingface.co/nyu-mll/roberta-base-1B-2
+[link-roberta-base-1B-3]: https://huggingface.co/nyu-mll/roberta-base-1B-3