From 5f25a5f367497278bf19c9994569db43f96d5278 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Fri, 11 Oct 2019 10:20:33 +0200 Subject: [PATCH] model: add support for new German BERT models (cased and uncased) from @dbmdz --- docs/source/pretrained_models.rst | 8 ++++++++ docs/source/serialization.rst | 2 ++ transformers/configuration_bert.py | 2 ++ transformers/modeling_bert.py | 2 ++ transformers/tokenization_bert.py | 6 ++++++ 5 files changed, 20 insertions(+) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index e7aa1a9b43..1d02cd0dd7 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -53,6 +53,14 @@ Here is the full list of the currently provided pretrained models together with | | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | The ``bert-base-cased`` model fine-tuned on MRPC | | | | (see `details of fine-tuning in the example section `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-german-dbmdz-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on cased German text by DBMDZ | +| | | (see `details on dbmdz repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on uncased German text by DBMDZ | +| | | (see `details on dbmdz repository `__). | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | OpenAI GPT English model | diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst index 0b0b600ec1..c948104d69 100644 --- a/docs/source/serialization.rst +++ b/docs/source/serialization.rst @@ -33,6 +33,8 @@ where * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once) * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869* + * ``bert-base-german-dbmdz-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation `__ + * ``bert-base-german-dbmdz-uncased``: Trained on (uncased) German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation `__ * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 122a2c9aab..d63be963eb 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -40,6 +40,8 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", + 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", + 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index fc448fa366..fecf1e4de8 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -48,6 +48,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", + 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", + 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", } def load_tf_weights_in_bert(model, config, tf_checkpoint_path): diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index d256f27a58..8affdd9036 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -44,6 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = { 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", + 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", } } @@ -61,6 +63,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, 'bert-large-cased-whole-word-masking-finetuned-squad': 512, 'bert-base-cased-finetuned-mrpc': 512, + 'bert-base-german-dbmdz-cased': 512, + 'bert-base-german-dbmdz-uncased': 512, } PRETRAINED_INIT_CONFIGURATION = { @@ -77,6 +81,8 @@ PRETRAINED_INIT_CONFIGURATION = { 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, + 'bert-base-german-dbmdz-cased': {'do_lower_case': False}, + 'bert-base-german-dbmdz-uncased': {'do_lower_case': True}, }