From e16d46843a19ab289b82138e4eccec5610a76de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Peller=20=28dataista=29?= Date: Tue, 22 Oct 2019 16:11:02 -0300 Subject: [PATCH 01/10] Fix architectures count --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ad771f2ab1..e8506d6a39 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ State-of-the-art NLP for everyone Lower compute costs, smaller carbon footprint - Researchers can share trained models instead of always retraining - Practitioners can reduce compute time and production costs -- 8 architectures with over 30 pretrained models, some in more than 100 languages +- 10 architectures with over 30 pretrained models, some in more than 100 languages Choose the right framework for every part of a model's lifetime - Train state-of-the-art models in 3 lines of code @@ -111,7 +111,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training ## Model architectures -🤗 Transformers currently provides 8 NLU/NLG architectures: +🤗 Transformers currently provides 10 NLU/NLG architectures: 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. From ef1b8b2ae5ad1057154a126879f7eb8de685f862 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 22 Oct 2019 21:27:20 +0000 Subject: [PATCH 02/10] [CTRL] warn if generation prompt does not start with a control code see also https://github.com/salesforce/ctrl/pull/50 --- README.md | 2 +- examples/README.md | 2 +- examples/run_generation.py | 5 ++- transformers/tokenization_ctrl.py | 59 +++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e8506d6a39..ecba50a74e 100644 --- a/README.md +++ b/README.md @@ -413,7 +413,7 @@ and from the Salesforce CTRL model: python ./examples/run_generation.py \ --model_type=ctrl \ --length=20 \ - --model_name_or_path=gpt2 \ + --model_name_or_path=ctrl \ --temperature=0 \ --repetition_penalty=1.2 \ ``` diff --git a/examples/README.md b/examples/README.md index 6b68d880eb..3a76a4a830 100644 --- a/examples/README.md +++ b/examples/README.md @@ -101,7 +101,7 @@ python run_lm_finetuning.py \ Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py). -Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. +Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL. A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you can try out the different models available in the library. diff --git a/examples/run_generation.py b/examples/run_generation.py index ef58cfd844..ae0e27dcf0 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -196,7 +196,7 @@ def main(): logger.info(args) if args.model_type in ["ctrl"]: - if args.temperature > 0.7 : + if args.temperature > 0.7: logger.info('CTRL typically works better with lower temperatures (and lower top_k).') while True: @@ -224,6 +224,9 @@ def main(): # Models with memory likes to have a long prompt for short inputs. raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text context_tokens = tokenizer.encode(raw_text) + if args.model_type == "ctrl": + if not any(context_tokens[0] == x for x in tokenizer.control_codes.values()): + logger.info("WARNING! You are not starting your generation from a control code so you won't get good results") out = sample_sequence( model=model, context=context_tokens, diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py index c8d67ad043..3d67fa2c5b 100644 --- a/transformers/tokenization_ctrl.py +++ b/transformers/tokenization_ctrl.py @@ -46,6 +46,64 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'ctrl': 256, } +CONTROL_CODES = { + "Pregnancy": 168629, + "Christianity": 7675, + "Explain": 106423, + "Fitness": 63440, + "Saving": 63163, + "Ask": 27171, + "Ass": 95985, + "Joke": 163509, + "Questions": 45622, + "Thoughts": 49605, + "Retail": 52342, + "Feminism": 164338, + "Writing": 11992, + "Atheism": 192263, + "Netflix": 48616, + "Computing": 39639, + "Opinion": 43213, + "Alone": 44967, + "Funny": 58917, + "Gaming": 40358, + "Human": 4088, + "India": 1331, + "Joker": 77138, + "Diet": 36206, + "Legal": 11859, + "Norman": 4939, + "Tip": 72689, + "Weight": 52343, + "Movies": 46273, + "Running": 23425, + "Science": 2090, + "Horror": 37793, + "Confession": 60572, + "Finance": 12250, + "Politics": 16360, + "Scary": 191985, + "Support": 12654, + "Technologies": 32516, + "Teenage": 66160, + "Event": 32769, + "Learned": 67460, + "Notion": 182770, + "Wikipedia": 37583, + "Books": 6665, + "Extract": 76050, + "Confessions": 102701, + "Conspiracy": 75932, + "Links": 63674, + "Narcissus": 150425, + "Relationship": 54766, + "Relationships": 134796, + "Reviews": 41671, + "News": 4256, + "Translation": 26820, + "multilingual": 128406, +} + def get_pairs(word): """Return set of symbol pairs in a word. @@ -68,6 +126,7 @@ class CTRLTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + control_codes = CONTROL_CODES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs) From 8ad5c591cda96a40d2fd2662a6b76af86527289d Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Wed, 23 Oct 2019 10:29:47 -0400 Subject: [PATCH 03/10] [RELEASE] DistilRoBERTa --- docs/source/pretrained_models.rst | 4 ++++ examples/distillation/README.md | 38 +++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 1d02cd0dd7..43c08228bd 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -136,6 +136,10 @@ Here is the full list of the currently provided pretrained models together with | | ``distilgpt2`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | | | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. | | | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | +| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | +| | | (see `details `__) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters | | | | | Salesforce's Large-sized CTRL English model | diff --git a/examples/distillation/README.md b/examples/distillation/README.md index 0fbcb5628b..344b5f7d46 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -1,25 +1,38 @@ # Distil* -This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2. +This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2. -**2019, October 3rd - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.** +**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller. + +**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.** + +**September 19th, 2019 - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future! -**2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future! ## What is Distil* Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production. -We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for DistilGPT2 (after fine-tuning on the train set). +We have applied the same method to other Transformer architectures and released the weights: +- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set). +- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base` performance on GLUE while being twice faster and 35% smaller. +- and more to come! 🤗🤗🤗 For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). Here are the results on the dev sets of GLUE: -| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI | -| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| -| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 | -| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 | +| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI | +| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: | +| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 | +| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 | +| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: | +| RoBERTa-base (reported) | **83.2**/**86.4**2 | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7** | +| DistilRoBERTa1 | **79.0**/**82.3**2 | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 | + +1 We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa. +2 Macro-score computed without WNLI. +3 We compute this score ourselves for completeness. ## Setup @@ -27,13 +40,15 @@ This part of the library has only be tested with Python3.6+. There are few speci **Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). + ## How to use DistilBERT Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score). -- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset and . The model has 6 layers, 768 dimension and 12 heads, totalizing 82M (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2. +- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2. +- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base. - and more to come! 🤗🤗🤗 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models. @@ -47,7 +62,10 @@ outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple ``` -Similarly, using DistilGPT2 simply consists in calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`. +Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint: +- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')` +- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')` + ## How to train Distil* From 5b6cafb11b39e78724dc13b57b81bd73c9a66b49 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Wed, 23 Oct 2019 10:35:16 -0400 Subject: [PATCH 04/10] [release] fix table weirdness --- examples/distillation/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/distillation/README.md b/examples/distillation/README.md index 344b5f7d46..7da1ad015b 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -26,12 +26,14 @@ Here are the results on the dev sets of GLUE: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: | | BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 | | DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 | -| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: | -| RoBERTa-base (reported) | **83.2**/**86.4**2 | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7** | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| RoBERTa-base (reported) | **83.2**/**86.4**2 | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.73 | | DistilRoBERTa1 | **79.0**/**82.3**2 | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 | -1 We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa. +1 We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa. + 2 Macro-score computed without WNLI. + 3 We compute this score ourselves for completeness. ## Setup From 66085a132161d3257bb971d886bea1b52a476e4e Mon Sep 17 00:00:00 2001 From: Matt Maybeno Date: Wed, 23 Oct 2019 21:05:13 -0700 Subject: [PATCH 05/10] RoBERTa token classification [WIP] copy paste bert token classification for roberta --- transformers/__init__.py | 2 + transformers/modeling_roberta.py | 72 +++++++++++++++++++ transformers/modeling_tf_roberta.py | 51 +++++++++++++ transformers/tests/modeling_roberta_test.py | 19 ++++- .../tests/modeling_tf_roberta_test.py | 15 ++++ 5 files changed, 158 insertions(+), 1 deletion(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index fbc92f078e..dbc66f86b9 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -89,6 +89,7 @@ if is_torch_available(): XLM_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, RobertaForMultipleChoice, + RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, @@ -139,6 +140,7 @@ if is_tf_available(): from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer, TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer, diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index eb340dc7fb..6b8d381579 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -343,6 +343,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) + @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) @@ -451,6 +452,77 @@ class RobertaForMultipleChoice(BertPreTrainedModel): return outputs # (loss), reshaped_logits, (hidden_states), (attentions) +@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class RobertaForTokenClassification(BertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForTokenClassification.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + def __init__(self, config): + super(RobertaForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, + position_ids=None, head_mask=None, labels=None): + + outputs = self.roberta(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), scores, (hidden_states), (attentions) + class RobertaClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 244c83f2b3..13a0522211 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -371,3 +371,54 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): outputs = (logits,) + outputs[2:] return outputs # logits, (hidden_states), (attentions) + + +@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import RobertaTokenizer, TFRobertaForTokenClassification + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = TFRobertaForTokenClassification.from_pretrained('roberta-base') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFRobertaMainLayer(config, name='roberta') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='classifier') + + def call(self, inputs, **kwargs): + outputs = self.roberta(inputs, **kwargs) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # scores, (hidden_states), (attentions) diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 82e10da915..0620ddf630 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -24,7 +24,8 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) + from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, + RobertaForSequenceClassification, RobertaForTokenClassification) from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP else: pytestmark = pytest.mark.skip("Require Torch") @@ -156,6 +157,22 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): [self.batch_size, self.seq_length, self.vocab_size]) self.check_loss_output(result) + def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = RobertaForTokenClassification(config=config) + model.eval() + loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, + labels=token_labels) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.seq_length, self.num_labels]) + self.check_loss_output(result) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, token_type_ids, input_mask, diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py index 735c9aae27..edbfa4e205 100644 --- a/transformers/tests/modeling_tf_roberta_test.py +++ b/transformers/tests/modeling_tf_roberta_test.py @@ -30,6 +30,7 @@ if is_tf_available(): import numpy from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) else: pytestmark = pytest.mark.skip("Require TensorFlow") @@ -154,6 +155,20 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]) + def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = TFRobertaForTokenClassification(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + logits, = model(inputs) + result = { + "logits": logits.numpy(), + } + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.seq_length, self.num_labels]) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, token_type_ids, input_mask, From b92d68421dee75c3a078b26b78a05bd59007d855 Mon Sep 17 00:00:00 2001 From: Matt Maybeno Date: Wed, 23 Oct 2019 21:31:28 -0700 Subject: [PATCH 06/10] Use roberta model and update doc strings --- transformers/modeling_roberta.py | 6 +++++- transformers/modeling_tf_roberta.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 6b8d381579..9d16c87888 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -478,12 +478,16 @@ class RobertaForTokenClassification(BertPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForTokenClassification.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ + config_class = RobertaConfig + pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "roberta" + def __init__(self, config): super(RobertaForTokenClassification, self).__init__(config) self.num_labels = config.num_labels diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 13a0522211..a239bc642b 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -396,7 +396,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForTokenClassification.from_pretrained('roberta-base') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] From 4e5f88b74fa914a5f45aec3260977acfc3513536 Mon Sep 17 00:00:00 2001 From: Matt Maybeno Date: Wed, 23 Oct 2019 22:50:03 -0700 Subject: [PATCH 07/10] Add Roberta to run_ner.py --- examples/run_ner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/run_ner.py b/examples/run_ner.py index 00eb039258..16fa89c3e7 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -35,15 +35,17 @@ from utils_ner import convert_examples_to_features, get_labels, read_examples_fr from transformers import AdamW, WarmupLinearSchedule from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer +from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer logger = logging.getLogger(__name__) ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, )), + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig)), ()) MODEL_CLASSES = { "bert": (BertConfig, BertForTokenClassification, BertTokenizer), + "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer) } From ae1d03fc51bb22ed59517ee6f92c560417fdb049 Mon Sep 17 00:00:00 2001 From: Matt Maybeno Date: Thu, 24 Oct 2019 10:43:57 -0700 Subject: [PATCH 08/10] Add roberta to doc --- examples/run_ner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_ner.py b/examples/run_ner.py index 16fa89c3e7..740b422429 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert). """ +""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """ from __future__ import absolute_import, division, print_function From 438f2730a03e19bc21f2823c659ceaed0dfe8ef7 Mon Sep 17 00:00:00 2001 From: altsoph Date: Fri, 25 Oct 2019 13:22:58 +0300 Subject: [PATCH 09/10] Evaluation code fixed. --- examples/run_lm_finetuning.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 571bcb4391..4d32385e40 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -86,6 +86,7 @@ class TextDataset(Dataset): # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. + self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[-block_size:])) # DIRTY! logger.info("Saving features into cached file %s", cached_features_file) with open(cached_features_file, 'wb') as handle: @@ -309,10 +310,12 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): - batch = batch.to(args.device) + inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) + inputs = inputs.to(args.device) + labels = labels.to(args.device) with torch.no_grad(): - outputs = model(batch, masked_lm_labels=batch) if args.mlm else model(batch, labels=batch) + outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 @@ -540,4 +543,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file From 079bfb32fba4f2b39d344ca7af88d79a3ff27c7c Mon Sep 17 00:00:00 2001 From: altsoph Date: Fri, 25 Oct 2019 13:26:37 +0300 Subject: [PATCH 10/10] Evaluation fixed. --- examples/run_lm_finetuning.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 4d32385e40..982d8aa1b7 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -86,7 +86,6 @@ class TextDataset(Dataset): # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. - self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[-block_size:])) # DIRTY! logger.info("Saving features into cached file %s", cached_features_file) with open(cached_features_file, 'wb') as handle: @@ -543,4 +542,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()