From 770043eea2927eea1664fdd56b3996a8fb41731c Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 7 Aug 2019 12:53:19 -0400 Subject: [PATCH] Sentence-pair tasks handling. Using common tests on RoBERTa. Forced push to fix indentation. --- pytorch_transformers/__init__.py | 3 + pytorch_transformers/modeling_roberta.py | 28 ++- .../tests/modeling_roberta_test.py | 200 ++++++++++++++---- .../tests/tokenization_roberta_test.py | 45 ++-- pytorch_transformers/tokenization_roberta.py | 90 ++++++-- 5 files changed, 279 insertions(+), 87 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index b4b957192c..d1e42b130a 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -5,6 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlm import XLMTokenizer +from .tokenization_roberta import RobertaTokenizer from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization) from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, @@ -33,6 +34,8 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_layer, Conv1D) diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index 109a719616..43f76989f4 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -23,6 +23,7 @@ import logging import torch import torch.nn as nn import torch.nn.functional as F +from torch.nn import CrossEntropyLoss from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings, BertLayerNorm, BertModel, @@ -78,7 +79,7 @@ class RobertaModel(BertModel): super(RobertaModel, self).__init__(config) self.embeddings = RobertaEmbeddings(config) - + self.apply(self.init_weights) class RobertaForMaskedLM(BertPreTrainedModel): @@ -94,16 +95,31 @@ class RobertaForMaskedLM(BertPreTrainedModel): self.roberta = RobertaModel(config) self.lm_head = RobertaLMHead(config) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): + + self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None, + head_mask=None): outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, head_mask=head_mask) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] - return outputs + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + outputs = (masked_lm_loss,) + outputs + + return outputs class RobertaLMHead(nn.Module): @@ -114,7 +130,7 @@ class RobertaLMHead(nn.Module): self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.weight = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) def forward(self, features, **kwargs): @@ -123,6 +139,6 @@ class RobertaLMHead(nn.Module): x = self.layer_norm(x) # project back to size of vocabulary with bias - x = F.linear(x, self.weight) + self.bias + x = self.decoder(x) + self.bias return x diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py index 62707326a6..273176b27a 100644 --- a/pytorch_transformers/tests/modeling_roberta_test.py +++ b/pytorch_transformers/tests/modeling_roberta_test.py @@ -12,58 +12,172 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function -import os import unittest +import shutil import pytest -import torch -from pytorch_transformers.modeling_roberta import (RobertaForMaskedLM, - RobertaModel) +from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM) +from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) -class RobertaModelTest(unittest.TestCase): +class RobertaModelTest(CommonTestCases.CommonModelTester): - # @pytest.mark.slow - def test_inference_masked_lm(self): - model = RobertaForMaskedLM.from_pretrained('roberta-base') - - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual( - output.shape, - expected_shape - ) - # compare the actual values for a slice. - expected_slice = torch.Tensor( - [[[33.8843, -4.3107, 22.7779], - [ 4.6533, -2.8099, 13.6252], - [ 1.8222, -3.6898, 8.8600]]] - ) - self.assertTrue( - torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) - ) + all_model_classes = (RobertaForMaskedLM, RobertaModel) - # @pytest.mark.slow - def test_inference_no_head(self): - model = RobertaModel.from_pretrained('roberta-base') - - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = torch.Tensor( - [[[-0.0231, 0.0782, 0.0074], - [-0.1854, 0.0539, -0.0174], - [ 0.0548, 0.0799, 0.1687]]] - ) - self.assertTrue( - torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) - ) + class RobertaModelTester(object): + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) -if __name__ == '__main__': + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RobertaConfig( + vocab_size_or_config_json_file=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, + token_labels, choice_labels): + model = RobertaModel(config=config) + model.eval() + sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask) + sequence_output, pooled_output = model(input_ids, token_type_ids) + sequence_output, pooled_output = model(input_ids) + + result = { + "sequence_output": sequence_output, + "pooled_output": pooled_output, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) + + def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, + token_labels, choice_labels): + model = RobertaForMaskedLM(config=config) + model.eval() + loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.check_loss_output(result) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = RobertaModelTest.RobertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_roberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index cd4e17ec34..60df18ae2b 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -12,32 +12,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import os import unittest -import pytest -import six -from pytorch_transformers.tokenization_roberta import RobertaTokenizer +from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES +from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory class RobertaTokenizationTest(unittest.TestCase): - # @pytest.mark.slow def test_full_tokenizer(self): - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - self.assertListEqual( - tokenizer.encode('Hello world!'), - [0, 31414, 232, 328, 2] - ) - if six.PY3: - self.assertListEqual( - tokenizer.encode('Hello world! cécé herlolip'), - [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] - ) + """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """ + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", + "lo", "low", "er", + "low", "lowest", "newer", "wider", ""] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + special_tokens_map = {"unk_token": ""} + with TemporaryDirectory() as tmpdirname: + vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + with open(vocab_file, "w") as fp: + [fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)] + + input_text = u"lower newer" + output_text = u"lowernewer" + + create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map) + + tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map) + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [13, 12, 17] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) if __name__ == '__main__': diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index 4f9a7bc0fa..7fa42bfb1c 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -22,22 +22,22 @@ import re from io import open import six -from .tokenization_utils import PreTrainedTokenizer +from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization from .tokenization_gpt2 import GPT2Tokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'dict_file': 'dict.txt', + 'vocab_file': 'dict.txt', } PRETRAINED_VOCAB_FILES_MAP = { - 'dict_file': - { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt", - }, + 'vocab_file': + { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt", + }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { @@ -46,7 +46,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'roberta-large-mnli': 512, } - SPACE_NORMALIZER = re.compile(r"\s+") def tokenize_line(line): @@ -142,7 +141,7 @@ class Dictionary(object): "rebuild the dataset".format(f)) return - lines = f.readlines() + lines = f.read().splitlines() for line in lines: idx = line.rfind(' ') if idx == -1: @@ -152,7 +151,7 @@ class Dictionary(object): self.indices[word] = len(self.symbols) self.symbols.append(word) self.count.append(count) - + def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True, consumer=None, append_eos=True, reverse_order=False): words = line_tokenizer(line) @@ -174,8 +173,6 @@ class Dictionary(object): return ids - - class RobertaTokenizer(PreTrainedTokenizer): """ RoBERTa tokenizer. Peculiarities: @@ -185,25 +182,53 @@ class RobertaTokenizer(PreTrainedTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, dict_file, + def __init__(self, vocab_file, bos_token="", eos_token="", **kwargs): - super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs) + super(RobertaTokenizer, self).__init__(cls_token=bos_token, sep_token=eos_token, eos_token=eos_token, **kwargs) self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - self.dictionary = Dictionary.load(dict_file) + self.dictionary = Dictionary.load(vocab_file) def _tokenize(self, text): """ Use GPT-2 Tokenizer """ return self.gpt2_tokenizer._tokenize(text) - def encode(self, text): + def encode(self, text, *args): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. """ - gpt2_tokens_joined = " ".join( - str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text)) - ) - bpe_sentence = ' ' + gpt2_tokens_joined + ' ' - return self.dictionary.encode_line(bpe_sentence, append_eos=False) + bpe_sentence = [self.cls_token] + \ + self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text)) + \ + [self.sep_token] + + if len(args): + for additional_sentence in args: + bpe_sentence += [self.sep_token + ] + \ + self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(additional_sentence)) + \ + [self.sep_token] + + return self.dictionary.encode_line(' '.join([str(token) for token in bpe_sentence]), append_eos=False) + + def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary + with options to remove special tokens and clean up tokenization spaces. + Handles sentence pairs. + """ + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + if any(isinstance(element, list) for element in filtered_tokens): + texts = [] + for element in filtered_tokens: + text = self.convert_tokens_to_string(element) + if clean_up_tokenization_spaces: + text = clean_up_tokenization(text) + texts.append(text) + return texts + else: + text = self.convert_tokens_to_string(filtered_tokens) + if clean_up_tokenization_spaces: + text = clean_up_tokenization(text) + return text def _convert_token_to_id(self, token): return self.dictionary.index(token) @@ -218,3 +243,24 @@ class RobertaTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): return self.gpt2_tokenizer.convert_tokens_to_string(tokens) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + # Remove the first and last tokens which are cls and sep tokens + ids = ids[1:-1] + # If multi sentence, then split (multi sentence found by looking for two sequential sep tokens) + ids = [list(map(int, example.split(' '))) for example in ' '.join([str(id) for id in ids]).split(' 2 2 ')] + + if len(ids) == 1: + tokens = self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), ids[0]))) + else: + tokens = [] + for example in ids: + tokens += [ + self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), example)))] + return tokens + + def convert_tokens_to_ids(self, tokens): + tokens = " ".join(str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(tokens)) + bpe_sentence = ' ' + tokens + ' ' + return self.dictionary.encode_line(bpe_sentence, append_eos=False) +