Updated test architecture
This commit is contained in:
@@ -19,8 +19,9 @@ from __future__ import print_function
|
|||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM)
|
from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
|
||||||
from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
||||||
@@ -156,6 +157,42 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
|
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
|
||||||
return config, inputs_dict
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def test_inference_masked_lm(self):
|
||||||
|
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
||||||
|
|
||||||
|
input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
|
output = model(input_ids)[0]
|
||||||
|
expected_shape = torch.Size((1, 11, 50265))
|
||||||
|
self.assertEqual(
|
||||||
|
output.shape,
|
||||||
|
expected_shape
|
||||||
|
)
|
||||||
|
# compare the actual values for a slice.
|
||||||
|
expected_slice = torch.Tensor(
|
||||||
|
[[[33.8843, -4.3107, 22.7779],
|
||||||
|
[4.6533, -2.8099, 13.6252],
|
||||||
|
[1.8222, -3.6898, 8.8600]]]
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||||
|
)
|
||||||
|
|
||||||
|
# @pytest.mark.slow
|
||||||
|
def test_inference_no_head(self):
|
||||||
|
model = RobertaModel.from_pretrained('roberta-base')
|
||||||
|
|
||||||
|
input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
|
output = model(input_ids)[0]
|
||||||
|
# compare the actual values for a slice.
|
||||||
|
expected_slice = torch.Tensor(
|
||||||
|
[[[-0.0231, 0.0782, 0.0074],
|
||||||
|
[-0.1854, 0.0539, -0.0174],
|
||||||
|
[0.0548, 0.0799, 0.1687]]]
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||||
|
)
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = RobertaModelTest.RobertaModelTester(self)
|
self.model_tester = RobertaModelTest.RobertaModelTester(self)
|
||||||
self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
|
self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
|
||||||
@@ -183,7 +220,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
class RobertaModelIntegrationTest(unittest.TestCase):
|
class RobertaModelIntegrationTest(unittest.TestCase):
|
||||||
|
|
||||||
@pytest.mark.slow
|
# @pytest.mark.slow
|
||||||
def test_inference_masked_lm(self):
|
def test_inference_masked_lm(self):
|
||||||
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
||||||
|
|
||||||
@@ -204,7 +241,7 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
|||||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.slow
|
# @pytest.mark.slow
|
||||||
def test_inference_no_head(self):
|
def test_inference_no_head(self):
|
||||||
model = RobertaModel.from_pretrained('roberta-base')
|
model = RobertaModel.from_pretrained('roberta-base')
|
||||||
|
|
||||||
|
|||||||
@@ -15,42 +15,68 @@
|
|||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import json
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
|
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, DICT_FILES_NAMES
|
||||||
from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
|
||||||
|
from .tokenization_tests_commons import CommonTestCases
|
||||||
|
|
||||||
|
|
||||||
class RobertaTokenizationTest(unittest.TestCase):
|
class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||||
|
tokenizer_class = RobertaTokenizer
|
||||||
|
|
||||||
def test_full_tokenizer(self):
|
def setUp(self):
|
||||||
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
super(RobertaTokenizationTest, self).setUp()
|
||||||
|
|
||||||
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||||
"lo", "low", "er",
|
"lo", "low", "er",
|
||||||
"low", "lowest", "newer", "wider", "<unk>"]
|
"low", "lowest", "newer", "wider", "<unk>"]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
special_tokens_map = {"unk_token": "<unk>"}
|
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
||||||
|
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
with TemporaryDirectory() as tmpdirname:
|
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||||
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||||
with open(vocab_file, "w") as fp:
|
with open(self.vocab_file, "w") as fp:
|
||||||
[fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)]
|
fp.write(json.dumps(vocab_tokens))
|
||||||
|
with open(self.merges_file, "w") as fp:
|
||||||
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
input_text = u"lower newer"
|
def get_tokenizer(self):
|
||||||
output_text = u"lower<unk>newer"
|
bpe_tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
|
||||||
|
return RobertaTokenizer.from_pretrained("roberta-base", bpe_tokenizer=bpe_tokenizer)
|
||||||
|
|
||||||
create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map)
|
def get_input_output_texts(self):
|
||||||
|
input_text = u"lower newer"
|
||||||
|
output_text = u"lower<unk>newer"
|
||||||
|
return input_text, output_text
|
||||||
|
|
||||||
tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map)
|
def test_full_tokenizer(self):
|
||||||
text = "lower"
|
tokenizer = self.get_tokenizer()
|
||||||
bpe_tokens = ["low", "er"]
|
text = "lower"
|
||||||
tokens = tokenizer.tokenize(text)
|
bpe_tokens = ["low", "er"]
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
tokens = tokenizer.tokenize(text)
|
||||||
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
input_bpe_tokens = [13, 12, 17]
|
input_bpe_tokens = [0, 4, 12, 176, 2]
|
||||||
self.assertListEqual(
|
tokenizer.convert_tokens_to_ids(input_tokens)
|
||||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
self.assertListEqual(
|
||||||
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
def roberta_dict_integration_testing(self):
|
||||||
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
tokenizer.encode('Hello world!'),
|
||||||
|
[0, 31414, 232, 328, 2]
|
||||||
|
)
|
||||||
|
self.assertListEqual(
|
||||||
|
tokenizer.encode('Hello world! cécé herlolip'),
|
||||||
|
[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -105,7 +105,7 @@ class CommonTestCases:
|
|||||||
self.assertEqual(added_toks, len(new_toks))
|
self.assertEqual(added_toks, len(new_toks))
|
||||||
self.assertEqual(all_size_2, all_size + len(new_toks))
|
self.assertEqual(all_size_2, all_size + len(new_toks))
|
||||||
|
|
||||||
tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
|
tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l", no_sep_cls_tokens=True)
|
||||||
self.assertGreaterEqual(len(tokens), 4)
|
self.assertGreaterEqual(len(tokens), 4)
|
||||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||||
@@ -121,7 +121,8 @@ class CommonTestCases:
|
|||||||
self.assertEqual(added_toks_2, len(new_toks_2))
|
self.assertEqual(added_toks_2, len(new_toks_2))
|
||||||
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
||||||
|
|
||||||
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
|
||||||
|
no_sep_cls_tokens=True)
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 6)
|
self.assertGreaterEqual(len(tokens), 6)
|
||||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||||
|
|||||||
Reference in New Issue
Block a user