From 337802783f0ba87a58e2e9adddb47ac5cb00646d Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 19:50:32 +0100 Subject: [PATCH 1/7] distilbert: add configuration for new German distilbert model --- transformers/configuration_distilbert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index 2a8a149acf..57b9e57fe0 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -27,7 +27,8 @@ logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", + 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json" } From 22333945fbd1f5eff8cfb664ac6f63363e6b72d4 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 19:51:01 +0100 Subject: [PATCH 2/7] distilbert: add pytorch model for new German distilbert model --- transformers/modeling_distilbert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index d30f493c69..cb2fa1915e 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -42,7 +42,8 @@ logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", + 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin" } From f21dfe36baaf316675a4d2f8c918e9e8afc11db2 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 19:51:31 +0100 Subject: [PATCH 3/7] distilbert: add vocab for new German distilbert model --- transformers/tokenization_distilbert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index dfa02926d8..61f967bf0b 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -33,6 +33,7 @@ PRETRAINED_VOCAB_FILES_MAP = { { 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt" } } From e631383d4fb55ed2dff53f2b80a29e51cd2f563d Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 19:52:40 +0100 Subject: [PATCH 4/7] docs: add new German distilbert model to pretrained models --- docs/source/pretrained_models.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index b0a578fd80..3b25a79802 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -151,6 +151,10 @@ Here is the full list of the currently provided pretrained models together with | | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | | | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | | | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters | +| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. | +| | | (see `details `__) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters | | | | | Salesforce's Large-sized CTRL English model | From e7cf2ccd1567615513013d5fc9f9002733f70e13 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 19:55:19 +0100 Subject: [PATCH 5/7] distillation: add German distilbert model --- examples/distillation/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/distillation/README.md b/examples/distillation/README.md index 8efd1ea6f4..c2765c28ff 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -2,6 +2,8 @@ This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2. +**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks. + **October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller. **October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.** @@ -45,10 +47,11 @@ This part of the library has only be tested with Python3.6+. There are few speci ## How to use DistilBERT -Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): +Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT): - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score). +- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score). - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2. - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base. - and more to come! 🤗🤗🤗 From 2e2c0375c3e725483d2ec297632dc52ed6a9f5fb Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 20:41:18 +0100 Subject: [PATCH 6/7] distilbert: add German distilbert model to positional embedding sizes map --- transformers/tokenization_distilbert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index 61f967bf0b..c82ac09727 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -40,6 +40,7 @@ PRETRAINED_VOCAB_FILES_MAP = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'distilbert-base-uncased': 512, 'distilbert-base-uncased-distilled-squad': 512, + 'distilbert-base-german-cased': 512, } From da06afafc87b03f7588b6bd319e1b7592b091339 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 19 Nov 2019 21:57:00 +0100 Subject: [PATCH 7/7] tree-wide: add trailing comma in configuration maps --- transformers/configuration_distilbert.py | 2 +- transformers/modeling_distilbert.py | 2 +- transformers/tokenization_distilbert.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index 57b9e57fe0..1c35f5c0dd 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json" + 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", } diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index cb2fa1915e..4ba75248a1 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin" + 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin", } diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index c82ac09727..6574e0bc4d 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -33,7 +33,7 @@ PRETRAINED_VOCAB_FILES_MAP = { { 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt" + 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", } }