From fbd746bd065a9aaacd1ef25840cdc9ec957e8cac Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 8 Aug 2019 18:21:34 -0400 Subject: [PATCH] Updated test architecture --- .../tests/modeling_roberta_test.py | 43 +++++++++++- .../tests/tokenization_roberta_test.py | 70 +++++++++++++------ .../tests/tokenization_tests_commons.py | 5 +- 3 files changed, 91 insertions(+), 27 deletions(-) diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py index 36145466b9..e0455d8508 100644 --- a/pytorch_transformers/tests/modeling_roberta_test.py +++ b/pytorch_transformers/tests/modeling_roberta_test.py @@ -19,8 +19,9 @@ from __future__ import print_function import unittest import shutil import pytest +import torch -from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM) +from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) @@ -156,6 +157,42 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} return config, inputs_dict + def test_inference_masked_lm(self): + model = RobertaForMaskedLM.from_pretrained('roberta-base') + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual( + output.shape, + expected_shape + ) + # compare the actual values for a slice. + expected_slice = torch.Tensor( + [[[33.8843, -4.3107, 22.7779], + [4.6533, -2.8099, 13.6252], + [1.8222, -3.6898, 8.8600]]] + ) + self.assertTrue( + torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + ) + + # @pytest.mark.slow + def test_inference_no_head(self): + model = RobertaModel.from_pretrained('roberta-base') + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = torch.Tensor( + [[[-0.0231, 0.0782, 0.0074], + [-0.1854, 0.0539, -0.0174], + [0.0548, 0.0799, 0.1687]]] + ) + self.assertTrue( + torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + ) + def setUp(self): self.model_tester = RobertaModelTest.RobertaModelTester(self) self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) @@ -183,7 +220,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): class RobertaModelIntegrationTest(unittest.TestCase): - @pytest.mark.slow + # @pytest.mark.slow def test_inference_masked_lm(self): model = RobertaForMaskedLM.from_pretrained('roberta-base') @@ -204,7 +241,7 @@ class RobertaModelIntegrationTest(unittest.TestCase): torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) ) - @pytest.mark.slow + # @pytest.mark.slow def test_inference_no_head(self): model = RobertaModel.from_pretrained('roberta-base') diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index 60df18ae2b..fbb3f8381d 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -15,42 +15,68 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import json import unittest -from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES -from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory +from pytorch_transformers.tokenization_roberta import RobertaTokenizer, DICT_FILES_NAMES +from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES +from .tokenization_tests_commons import CommonTestCases -class RobertaTokenizationTest(unittest.TestCase): +class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): + tokenizer_class = RobertaTokenizer - def test_full_tokenizer(self): - """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """ + def setUp(self): + super(RobertaTokenizationTest, self).setUp() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low", "er", "low", "lowest", "newer", "wider", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - special_tokens_map = {"unk_token": ""} + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} - with TemporaryDirectory() as tmpdirname: - vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(vocab_file, "w") as fp: - [fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + with open(self.vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) - input_text = u"lower newer" - output_text = u"lowernewer" + def get_tokenizer(self): + bpe_tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map) + return RobertaTokenizer.from_pretrained("roberta-base", bpe_tokenizer=bpe_tokenizer) - create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map) + def get_input_output_texts(self): + input_text = u"lower newer" + output_text = u"lowernewer" + return input_text, output_text - tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map) - text = "lower" - bpe_tokens = ["low", "er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) - input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [13, 12, 17] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [0, 4, 12, 176, 2] + tokenizer.convert_tokens_to_ids(input_tokens) + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def roberta_dict_integration_testing(self): + tokenizer = self.get_tokenizer() + + self.assertListEqual( + tokenizer.encode('Hello world!'), + [0, 31414, 232, 328, 2] + ) + self.assertListEqual( + tokenizer.encode('Hello world! cécé herlolip'), + [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] + ) if __name__ == '__main__': diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index ebcf6f48d8..e766a825a0 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -105,7 +105,7 @@ class CommonTestCases: self.assertEqual(added_toks, len(new_toks)) self.assertEqual(all_size_2, all_size + len(new_toks)) - tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l") + tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l", no_sep_cls_tokens=True) self.assertGreaterEqual(len(tokens), 4) self.assertGreater(tokens[0], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) @@ -121,7 +121,8 @@ class CommonTestCases: self.assertEqual(added_toks_2, len(new_toks_2)) self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l") + tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", + no_sep_cls_tokens=True) self.assertGreaterEqual(len(tokens), 6) self.assertGreater(tokens[0], tokenizer.vocab_size - 1)