Add integration tests for xlm roberta modelling and xlm roberta tokenizer (#3014)
* add first files * add xlm roberta integration tests * make style * flake 8 issues solved
This commit is contained in:
committed by
GitHub
parent
e8ce63ff21
commit
c913eb9c38
68
tests/test_modeling_xlm_roberta.py
Normal file
68
tests/test_modeling_xlm_roberta.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .utils import slow
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from transformers import XLMRobertaModel
|
||||
|
||||
|
||||
@unittest.skipUnless(is_torch_available(), "XLM-RoBERTa model integration tests require PyTorch")
class XLMRobertaModelIntegrationTest(unittest.TestCase):
    """Integration tests comparing pretrained XLM-RoBERTa outputs to stored reference values.

    Each test loads a pretrained checkpoint, runs one fixed sentence through it
    and checks the output shape plus the last hidden dimension of every token
    against hard-coded reference numbers.

    The class is skipped when PyTorch is not installed: ``torch`` and
    ``XLMRobertaModel`` are only imported under ``is_torch_available()``, so
    running the tests without it would fail with a ``NameError`` rather than
    being reported as skipped.
    """

    def _check_last_dim_slice(self, model_name, hidden_size, expected_last_dim):
        """Run the fixed input through ``model_name`` and compare against reference values.

        Args:
            model_name: Identifier passed to ``XLMRobertaModel.from_pretrained``.
            hidden_size: Expected size of the last axis of the model output.
            expected_last_dim: One reference float per input token, compared
                with ``output[:, :, -1]`` using an absolute tolerance of 1e-3.
        """
        model = XLMRobertaModel.from_pretrained(model_name)

        # Token ids for: "The dog is cute and lives in the garden house"
        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])

        # batch_size, sequence_length, embedding_vector_dim
        expected_output_shape = torch.Size((1, 12, hidden_size))
        expected_output_values_last_dim = torch.tensor([expected_last_dim])

        # Only the forward values are inspected, so skip building the autograd graph.
        with torch.no_grad():
            output = model(input_ids)[0]

        self.assertEqual(output.shape, expected_output_shape)
        # compare the actual values for a slice of last dim
        self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))

    @slow
    def test_xlm_roberta_base(self):
        """Check ``xlm-roberta-base`` output shape and last-dimension values."""
        # Reference values can be regenerated with the original implementation:
        #   xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')
        #   xlmr.eval()
        #   expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
        self._check_last_dim_slice(
            "xlm-roberta-base",
            768,
            [-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252],
        )

    @slow
    def test_xlm_roberta_large(self):
        """Check ``xlm-roberta-large`` output shape and last-dimension values."""
        # Reference values can be regenerated with the original implementation:
        #   xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
        #   xlmr.eval()
        #   expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
        self._check_last_dim_slice(
            "xlm-roberta-large",
            1024,
            [-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126],
        )
|
||||
111
tests/test_tokenization_xlm_roberta.py
Normal file
111
tests/test_tokenization_xlm_roberta.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
|
||||
|
||||
from .utils import slow
|
||||
|
||||
|
||||
class XLMRobertaTokenizationIntegrationTest(unittest.TestCase):
    """Integration tests comparing ``XLMRobertaTokenizer`` ids to stored reference encodings.

    The reference ids can be regenerated with the original implementation
    (``xlmr.large`` has the same tokenizer):

        xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')
        xlmr.eval()
        xlmr.encode(text)
    """

    @slow
    def test_tokenization_base_easy_symbols(self):
        """Encode a short, plain sentence and compare with the reference ids."""
        text = "Hello World!"
        expected_ids = [0, 35378, 6661, 38, 2]

        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        actual_ids = tokenizer.encode(text)

        self.assertListEqual(expected_ids, actual_ids)

    @slow
    def test_tokenization_base_hard_symbols(self):
        """Encode a long sentence with punctuation and a made-up word, then compare with the reference ids."""
        # The text is part of the fixture: it must stay byte-identical to the
        # string the reference ids below were produced from.
        text = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
        # fmt: off
        expected_ids = [
            0, 3293, 83, 10, 4552, 4989, 7986, 678, 10, 5915,
            111, 179459, 124850, 4, 6044, 237, 12, 6, 5, 6,
            4, 6780, 705, 15, 1388, 44, 378, 10114, 711, 152,
            20, 6, 5, 22376, 642, 1221, 15190, 34153, 450, 5608,
            959, 1119, 57702, 136, 186, 47, 1098, 29367, 47, 4426,
            3678, 2740, 4, 6044, 237, 6284, 50901, 528, 31, 90,
            34, 927, 2,
        ]
        # fmt: on

        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        actual_ids = tokenizer.encode(text)

        self.assertListEqual(expected_ids, actual_ids)
|
||||
Reference in New Issue
Block a user