From 912a377e904d1ec10ce2555c80035c074ff51e12 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 28 Aug 2019 13:59:42 +0200 Subject: [PATCH] dilbert -> distilbert --- README.md | 2 +- examples/distillation/README.md | 28 ++-- examples/distillation/dataset.py | 2 +- examples/distillation/distiller.py | 2 +- .../distillation/scripts/binarized_data.py | 2 +- .../scripts/extract_for_distil.py | 22 ++-- examples/distillation/scripts/token_counts.py | 2 +- examples/distillation/train.py | 12 +- examples/distillation/utils.py | 2 +- pytorch_transformers/__init__.py | 8 +- pytorch_transformers/modeling_auto.py | 10 +- ...ling_dilbert.py => modeling_distilbert.py} | 120 +++++++++--------- .../tests/modeling_dilbert_test.py | 50 ++++---- .../tests/tokenization_dilbert_test.py | 10 +- ..._dilbert.py => tokenization_distilbert.py} | 16 +-- 15 files changed, 144 insertions(+), 144 deletions(-) rename pytorch_transformers/{modeling_dilbert.py => modeling_distilbert.py} (87%) rename pytorch_transformers/{tokenization_dilbert.py => tokenization_distilbert.py} (75%) diff --git a/README.md b/README.md index de69e69788..5f69ad778f 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The library currently contains PyTorch implementations, pre-trained model weight 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -8. **[DilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DilBERT, a distilled version of BERT](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-dilbert-a-distilled-version-of-bert-8cf3380435b5 +8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5 ) by Victor Sanh, Lysandre Debut and Thomas Wolf. These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html). diff --git a/examples/distillation/README.md b/examples/distillation/README.md index c037bd0c24..1b8a4f7178 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -1,33 +1,33 @@ -# DilBERT +# DistilBERT -This folder contains the original code used to train DilBERT as well as examples showcasing how to use DilBERT. +This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT. -## What is DilBERT +## What is DistilBERT -DilBERT stands for Distillated-BERT. DilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. DilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DilBERT is thus an interesting option to put large-scaled trained Transformer model into production. +DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production. -For more information on DilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-dilbert-a-distilled-version-of-bert-8cf3380435b5 +For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5 ). -## How to use DilBERT +## How to use DistilBERT -PyTorch-Transformers includes two pre-trained DilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DilBERT): +PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): -- `dilbert-base-uncased`: DilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. -- `dilbert-base-uncased-distilled-squad`: A finetuned version of `dilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.2 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score). +- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. +- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.2 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score). -Using DilBERT is very similar to using BERT. DilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DilBertTokenizer` name to have a consistent naming between the library models. +Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models. ```python -tokenizer = DilBertTokenizer.from_pretrained('dilbert-base-uncased') -model = DilBertModel.from_pretrained('dilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') +model = DistilBertModel.from_pretrained('distilbert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple ``` -## How to train DilBERT +## How to train DistilBERT In the following, we will explain how you can train your own compressed model. @@ -68,7 +68,7 @@ python train.py \ By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them. -We highly encourage you to distributed training for training DilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs: +We highly encourage you to distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs: ```bash export NODE_RANK=0 diff --git a/examples/distillation/dataset.py b/examples/distillation/dataset.py index b9f58f775e..b3b76fd83c 100644 --- a/examples/distillation/dataset.py +++ b/examples/distillation/dataset.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Dataloaders to train DilBERT. +Dataloaders to train DistilBERT. """ from typing import List import math diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index c2d4a9785a..e6c27fe365 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -The distiller to distil DilBERT. +The distiller to distil DistilBERT. """ import os import math diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index c79001bb5e..d1c97bd296 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Preprocessing script before training DilBERT. +Preprocessing script before training DistilBERT. """ import argparse import pickle diff --git a/examples/distillation/scripts/extract_for_distil.py b/examples/distillation/scripts/extract_for_distil.py index 1cbf19d2cf..f3eee024ec 100644 --- a/examples/distillation/scripts/extract_for_distil.py +++ b/examples/distillation/scripts/extract_for_distil.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Preprocessing script before training DilBERT. +Preprocessing script before training DistilBERT. """ from pytorch_transformers import BertForPreTraining import torch @@ -33,32 +33,32 @@ if __name__ == '__main__': compressed_sd = {} for w in ['word_embeddings', 'position_embeddings']: - compressed_sd[f'dilbert.embeddings.{w}.weight'] = \ + compressed_sd[f'distilbert.embeddings.{w}.weight'] = \ state_dict[f'bert.embeddings.{w}.weight'] for w in ['weight', 'bias']: - compressed_sd[f'dilbert.embeddings.LayerNorm.{w}'] = \ + compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \ state_dict[f'bert.embeddings.LayerNorm.{w}'] std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: for w in ['weight', 'bias']: - compressed_sd[f'dilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.key.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.value.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.dense.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.intermediate.dense.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.output.dense.{w}'] - compressed_sd[f'dilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \ + compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \ state_dict[f'bert.encoder.layer.{teacher_idx}.output.LayerNorm.{w}'] std_idx += 1 diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py index 2f5ed83922..eb3fb738e0 100644 --- a/examples/distillation/scripts/token_counts.py +++ b/examples/distillation/scripts/token_counts.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Preprocessing script before training DilBERT. +Preprocessing script before training DistilBERT. """ from collections import Counter import argparse diff --git a/examples/distillation/train.py b/examples/distillation/train.py index 5af42dd8f4..712f10b47d 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Training DilBERT. +Training DistilBERT. """ import os import argparse @@ -24,7 +24,7 @@ import numpy as np import torch from pytorch_transformers import BertTokenizer, BertForMaskedLM -from pytorch_transformers import DilBertForMaskedLM, DilBertConfig +from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig from distiller import Distiller from utils import git_log, logger, init_gpu_params, set_seed @@ -201,13 +201,13 @@ def main(): assert os.path.isfile(os.path.join(args.from_pretrained_config)) logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}') logger.info(f'Loading pretrained config from {args.from_pretrained_config}') - stu_architecture_config = DilBertConfig.from_json_file(args.from_pretrained_config) - student = DilBertForMaskedLM.from_pretrained(args.from_pretrained_weights, + stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config) + student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights, config=stu_architecture_config) else: args.vocab_size_or_config_json_file = args.vocab_size - stu_architecture_config = DilBertConfig(**vars(args)) - student = DilBertForMaskedLM(stu_architecture_config) + stu_architecture_config = DistilBertConfig(**vars(args)) + student = DistilBertForMaskedLM(stu_architecture_config) if args.n_gpu > 0: diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py index 14bb0e0016..461c371898 100644 --- a/examples/distillation/utils.py +++ b/examples/distillation/utils.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Utils to train DilBERT. +Utils to train DistilBERT. """ import git import json diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 22bc4d3c21..47783057d1 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -7,7 +7,7 @@ from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer -from .tokenization_dilbert import DilBertTokenizer +from .tokenization_distilbert import DistilBertTokenizer from .tokenization_utils import (PreTrainedTokenizer) @@ -41,9 +41,9 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_dilbert import (DilBertConfig, DilBertForMaskedLM, DilBertModel, - DilBertForSequenceClassification, DilBertForQuestionAnswering, - DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel, + DistilBertForSequenceClassification, DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_layer, Conv1D) diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 7e65269926..cdacb7b552 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -30,7 +30,7 @@ from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel from .modeling_xlnet import XLNetConfig, XLNetModel from .modeling_xlm import XLMConfig, XLMModel from .modeling_roberta import RobertaConfig, RobertaModel -from .modeling_dilbert import DilBertConfig, DilBertModel +from .modeling_distilbert import DistilBertConfig, DistilBertModel from .modeling_utils import PreTrainedModel, SequenceSummary @@ -111,8 +111,8 @@ class AutoConfig(object): assert unused_kwargs == {'foo': False} """ - if 'dilbert' in pretrained_model_name_or_path: - return DilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -228,8 +228,8 @@ class AutoModel(object): model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'dilbert' in pretrained_model_name_or_path: - return DilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: diff --git a/pytorch_transformers/modeling_dilbert.py b/pytorch_transformers/modeling_distilbert.py similarity index 87% rename from pytorch_transformers/modeling_dilbert.py rename to pytorch_transformers/modeling_distilbert.py index 867ba0e6a8..af77757293 100644 --- a/pytorch_transformers/modeling_dilbert.py +++ b/pytorch_transformers/modeling_distilbert.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -PyTorch DilBERT model. +PyTorch DistilBERT model. """ from __future__ import absolute_import, division, print_function, unicode_literals @@ -36,19 +36,19 @@ import logging logger = logging.getLogger(__name__) -DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'dilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/dilbert-base-uncased-pytorch_model.bin", - 'dilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/dilbert-base-uncased-distilled-squad-pytorch_model.bin" +DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" } -DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'dilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/dilbert-base-uncased-config.json", - 'dilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/dilbert-base-uncased-distilled-squad-config.json" +DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" } -class DilBertConfig(PretrainedConfig): - pretrained_config_archive_map = DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +class DistilBertConfig(PretrainedConfig): + pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, vocab_size_or_config_json_file=30522, @@ -66,7 +66,7 @@ class DilBertConfig(PretrainedConfig): qa_dropout=0.1, seq_classif_dropout=0.2, **kwargs): - super(DilBertConfig, self).__init__(**kwargs) + super(DistilBertConfig, self).__init__(**kwargs) if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -398,17 +398,17 @@ class Transformer(nn.Module): ### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### -class DilBertPreTrainedModel(PreTrainedModel): +class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = DilBertConfig - pretrained_model_archive_map = DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP + config_class = DistilBertConfig + pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = None - base_model_prefix = "dilbert" + base_model_prefix = "distilbert" def __init__(self, *inputs, **kwargs): - super(DilBertPreTrainedModel, self).__init__(*inputs, **kwargs) + super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs) def init_weights(self, module): """ Initialize the weights. @@ -425,36 +425,36 @@ class DilBertPreTrainedModel(PreTrainedModel): module.bias.data.zero_() -DILBERT_START_DOCSTRING = r""" - DilBERT is a small, fast, cheap and light Transformer model +DISTILBERT_START_DOCSTRING = r""" + DistilBERT is a small, fast, cheap and light Transformer model trained by distilling Bert base. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. - Here are the differences between the interface of Bert and DilBert: + Here are the differences between the interface of Bert and DistilBert: - - DilBert doesn't have `token_type_ids`, you don't need to indicate which token belong to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`) - - DilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option. + - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belong to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`) + - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option. - For more information on DilBERT, please refer to our + For more information on DistilBERT, please refer to our `detailed blog post`_ .. _`detailed blog post`: - https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-dilbert-a-distilled-version-of-bert-8cf3380435b5 + https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5 Parameters: - config (:class:`~pytorch_transformers.DilBertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ -DILBERT_INPUTS_DOCSTRING = r""" +DISTILBERT_INPUTS_DOCSTRING = r""" Inputs: **input_ids**L ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices oof input sequence tokens in the vocabulary. The input sequences should start with `[CLS]` and `[SEP]` tokens. - For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DilBERT. + For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT. **attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: @@ -465,9 +465,9 @@ DILBERT_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare DilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", - DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING) -class DilBertModel(DilBertPreTrainedModel): +@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertModel(DistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` @@ -482,15 +482,15 @@ class DilBertModel(DilBertPreTrainedModel): Examples:: - tokenizer = DilBertTokenizer.from_pretrained('dilbert-base-uncased') - model = DilBertModel.from_pretrained('dilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertModel.from_pretrained('distilbert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ def __init__(self, config): - super(DilBertModel, self).__init__(config) + super(DistilBertModel, self).__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder @@ -543,9 +543,9 @@ class DilBertModel(DilBertPreTrainedModel): return output # last-layer hidden-state, (all hidden_states), (all attentions) -@add_start_docstrings("""DilBert Model with a `masked language modeling` head on top. """, - DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING) -class DilBertForMaskedLM(DilBertPreTrainedModel): +@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForMaskedLM(DistilBertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. @@ -568,19 +568,19 @@ class DilBertForMaskedLM(DilBertPreTrainedModel): Examples:: - tokenizer = DilBertTokenizer.from_pretrained('dilbert-base-uncased') - model = DilBertForMaskedLM.from_pretrained('dilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ def __init__(self, config): - super(DilBertForMaskedLM, self).__init__(config) + super(DistilBertForMaskedLM, self).__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.dilbert = DilBertModel(config) + self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) @@ -595,14 +595,14 @@ class DilBertForMaskedLM(DilBertPreTrainedModel): Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ self._tie_or_clone_weights(self.vocab_projector, - self.dilbert.embeddings.word_embeddings) + self.distilbert.embeddings.word_embeddings) def forward(self, input_ids: torch.tensor, attention_mask: torch.tensor = None, masked_lm_labels: torch.tensor = None, head_mask: torch.tensor = None): - dlbrt_output = self.dilbert(input_ids=input_ids, + dlbrt_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) @@ -620,10 +620,10 @@ class DilBertForMaskedLM(DilBertPreTrainedModel): return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) -@add_start_docstrings("""DilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING) -class DilBertForSequenceClassification(DilBertPreTrainedModel): + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForSequenceClassification(DistilBertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the sequence classification/regression loss. @@ -646,8 +646,8 @@ class DilBertForSequenceClassification(DilBertPreTrainedModel): Examples:: - tokenizer = DilBertTokenizer.from_pretrained('dilbert-base-uncased') - model = DilBertForSequenceClassification.from_pretrained('dilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) @@ -655,10 +655,10 @@ class DilBertForSequenceClassification(DilBertPreTrainedModel): """ def __init__(self, config): - super(DilBertForSequenceClassification, self).__init__(config) + super(DistilBertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels - self.dilbert = DilBertModel(config) + self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) @@ -670,17 +670,17 @@ class DilBertForSequenceClassification(DilBertPreTrainedModel): attention_mask: torch.tensor = None, labels: torch.tensor = None, head_mask: torch.tensor = None): - dilbert_output = self.dilbert(input_ids=input_ids, + distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask) - hidden_state = dilbert_output[0] # (bs, seq_len, dim) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) - outputs = (logits,) + dilbert_output[1:] + outputs = (logits,) + distilbert_output[1:] if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() @@ -693,10 +693,10 @@ class DilBertForSequenceClassification(DilBertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""DilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING) -class DilBertForQuestionAnswering(DilBertPreTrainedModel): + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -724,8 +724,8 @@ class DilBertForQuestionAnswering(DilBertPreTrainedModel): Examples:: - tokenizer = DilBertTokenizer.from_pretrained('dilbert-base-uncased') - model = DilBertForQuestionAnswering.from_pretrained('dilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) @@ -734,9 +734,9 @@ class DilBertForQuestionAnswering(DilBertPreTrainedModel): """ def __init__(self, config): - super(DilBertForQuestionAnswering, self).__init__(config) + super(DistilBertForQuestionAnswering, self).__init__(config) - self.dilbert = DilBertModel(config) + self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) @@ -749,10 +749,10 @@ class DilBertForQuestionAnswering(DilBertPreTrainedModel): start_positions: torch.tensor = None, end_positions: torch.tensor = None, head_mask: torch.tensor = None): - dilbert_output = self.dilbert(input_ids=input_ids, + distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask) - hidden_states = dilbert_output[0] # (bs, max_query_len, dim) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) @@ -760,7 +760,7 @@ class DilBertForQuestionAnswering(DilBertPreTrainedModel): start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) - outputs = (start_logits, end_logits,) + dilbert_output[1:] + outputs = (start_logits, end_logits,) + distilbert_output[1:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: diff --git a/pytorch_transformers/tests/modeling_dilbert_test.py b/pytorch_transformers/tests/modeling_dilbert_test.py index 2fd707dfd8..1c9d9c792d 100644 --- a/pytorch_transformers/tests/modeling_dilbert_test.py +++ b/pytorch_transformers/tests/modeling_dilbert_test.py @@ -20,23 +20,23 @@ import unittest import shutil import pytest -from pytorch_transformers import (DilBertConfig, DilBertModel, DilBertForMaskedLM, - DilBertForQuestionAnswering, DilBertForSequenceClassification) -from pytorch_transformers.modeling_dilbert import DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, + DistilBertForQuestionAnswering, DistilBertForSequenceClassification) +from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) -class DilBertModelTest(CommonTestCases.CommonModelTester): +class DistilBertModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (DilBertModel, DilBertForMaskedLM, DilBertForQuestionAnswering, - DilBertForSequenceClassification) + all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, + DistilBertForSequenceClassification) test_pruning = True test_torchscript = True test_resize_embeddings = True test_head_masking = True - class DilBertModelTester(object): + class DistilBertModelTester(object): def __init__(self, parent, @@ -100,7 +100,7 @@ class DilBertModelTest(CommonTestCases.CommonModelTester): token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) choice_labels = ids_tensor([self.batch_size], self.num_choices) - config = DilBertConfig( + config = DistilBertConfig( vocab_size_or_config_json_file=self.vocab_size, dim=self.hidden_size, n_layers=self.num_hidden_layers, @@ -119,8 +119,8 @@ class DilBertModelTest(CommonTestCases.CommonModelTester): list(result["loss"].size()), []) - def create_and_check_dilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = DilBertModel(config=config) + def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = DistilBertModel(config=config) model.eval() (sequence_output,) = model(input_ids, input_mask) (sequence_output,) = model(input_ids) @@ -132,8 +132,8 @@ class DilBertModelTest(CommonTestCases.CommonModelTester): list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]) - def create_and_check_dilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = DilBertForMaskedLM(config=config) + def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = DistilBertForMaskedLM(config=config) model.eval() loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels) result = { @@ -145,8 +145,8 @@ class DilBertModelTest(CommonTestCases.CommonModelTester): [self.batch_size, self.seq_length, self.vocab_size]) self.check_loss_output(result) - def create_and_check_dilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = DilBertForQuestionAnswering(config=config) + def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = DistilBertForQuestionAnswering(config=config) model.eval() loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels) result = { @@ -162,9 +162,9 @@ class DilBertModelTest(CommonTestCases.CommonModelTester): [self.batch_size, self.seq_length]) self.check_loss_output(result) - def create_and_check_dilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): config.num_labels = self.num_labels - model = DilBertForSequenceClassification(config) + model = DistilBertForSequenceClassification(config) model.eval() loss, logits = model(input_ids, input_mask, sequence_labels) result = { @@ -183,33 +183,33 @@ class DilBertModelTest(CommonTestCases.CommonModelTester): return config, inputs_dict def setUp(self): - self.model_tester = DilBertModelTest.DilBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=DilBertConfig, dim=37) + self.model_tester = DistilBertModelTest.DistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) def test_config(self): self.config_tester.run_common_tests() - def test_dilbert_model(self): + def test_distilbert_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_dilbert_model(*config_and_inputs) + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_dilbert_for_masked_lm(*config_and_inputs) + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_dilbert_for_question_answering(*config_and_inputs) + self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_dilbert_for_sequence_classification(*config_and_inputs) + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) # @pytest.mark.slow # def test_model_from_pretrained(self): # cache_dir = "/tmp/pytorch_transformers_test/" - # for model_name in list(DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - # model = DilBertModel.from_pretrained(model_name, cache_dir=cache_dir) + # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + # model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir) # shutil.rmtree(cache_dir) # self.assertIsNotNone(model) diff --git a/pytorch_transformers/tests/tokenization_dilbert_test.py b/pytorch_transformers/tests/tokenization_dilbert_test.py index 4cc7aa6c88..30268db216 100644 --- a/pytorch_transformers/tests/tokenization_dilbert_test.py +++ b/pytorch_transformers/tests/tokenization_dilbert_test.py @@ -18,20 +18,20 @@ import os import unittest from io import open -from pytorch_transformers.tokenization_dilbert import (DilBertTokenizer) +from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) from .tokenization_tests_commons import CommonTestCases from .tokenization_bert_test import BertTokenizationTest -class DilBertTokenizationTest(BertTokenizationTest): +class DistilBertTokenizationTest(BertTokenizationTest): - tokenizer_class = DilBertTokenizer + tokenizer_class = DistilBertTokenizer def get_tokenizer(self): - return DilBertTokenizer.from_pretrained(self.tmpdirname) + return DistilBertTokenizer.from_pretrained(self.tmpdirname) def test_sequence_builders(self): - tokenizer = DilBertTokenizer.from_pretrained("dilbert-base-uncased") + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") diff --git a/pytorch_transformers/tokenization_dilbert.py b/pytorch_transformers/tokenization_distilbert.py similarity index 75% rename from pytorch_transformers/tokenization_dilbert.py rename to pytorch_transformers/tokenization_distilbert.py index 8d71e1b486..116da41b37 100644 --- a/pytorch_transformers/tokenization_dilbert.py +++ b/pytorch_transformers/tokenization_distilbert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for DilBERT.""" +"""Tokenization classes for DistilBERT.""" from __future__ import absolute_import, division, print_function, unicode_literals @@ -31,21 +31,21 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { - 'dilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'dilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'dilbert-base-uncased': 512, - 'dilbert-base-uncased-distilled-squad': 512, + 'distilbert-base-uncased': 512, + 'distilbert-base-uncased-distilled-squad': 512, } -class DilBertTokenizer(BertTokenizer): +class DistilBertTokenizer(BertTokenizer): r""" - Constructs a DilBertTokenizer. - :class:`~pytorch_transformers.DilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece + Constructs a DistilBertTokenizer. + :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file