# Source: tests/test_modeling_xlm_roberta.py (new file)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import unittest

from transformers import is_torch_available

from .utils import slow


if is_torch_available():
    import torch
    from transformers import XLMRobertaModel


class XLMRobertaModelIntegrationTest(unittest.TestCase):
    """Run pretrained XLM-RoBERTa checkpoints on a fixed input and compare with stored reference values.

    Both tests are decorated with ``@slow`` and load a checkpoint by name via
    ``XLMRobertaModel.from_pretrained``.
    """

    @slow
    def test_xlm_roberta_base(self):
        """Check output shape and the last hidden-dim slice of ``xlm-roberta-base``."""
        # Token ids for: "The dog is cute and lives in the garden house"
        token_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])

        # batch_size, sequence_length, embedding_vector_dim
        reference_shape = torch.Size((1, 12, 768))
        # Reference values for the last embedding dimension. Snippet recorded
        # alongside them for regenerating the values with fairseq:
        #   xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')
        #   xlmr.eval()
        #   expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
        reference_last_dim = torch.tensor(
            [[-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]]
        )

        pretrained = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        hidden_states = pretrained(token_ids)[0].detach()

        self.assertEqual(hidden_states.shape, reference_shape)
        # Only a slice of the last dimension is compared against the reference.
        values_match = torch.allclose(hidden_states[..., -1], reference_last_dim, atol=1e-3)
        self.assertTrue(values_match)

    @slow
    def test_xlm_roberta_large(self):
        """Check output shape and the last hidden-dim slice of ``xlm-roberta-large``."""
        # Token ids for: "The dog is cute and lives in the garden house"
        token_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])

        # batch_size, sequence_length, embedding_vector_dim
        reference_shape = torch.Size((1, 12, 1024))
        # Reference values for the last embedding dimension. Snippet recorded
        # alongside them for regenerating the values with fairseq:
        #   xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
        #   xlmr.eval()
        #   expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
        reference_last_dim = torch.tensor(
            [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]]
        )

        pretrained = XLMRobertaModel.from_pretrained("xlm-roberta-large")
        hidden_states = pretrained(token_ids)[0].detach()

        self.assertEqual(hidden_states.shape, reference_shape)
        # Only a slice of the last dimension is compared against the reference.
        values_match = torch.allclose(hidden_states[..., -1], reference_last_dim, atol=1e-3)
        self.assertTrue(values_match)


# Source: tests/test_tokenization_xlm_roberta.py (new file)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import unittest

from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer

from .utils import slow


class XLMRobertaTokenizationIntegrationTest(unittest.TestCase):
    """Compare ``XLMRobertaTokenizer.encode`` against stored reference encodings.

    Both tests are decorated with ``@slow`` and load the ``xlm-roberta-base``
    tokenizer by name via ``from_pretrained``.
    """

    @slow
    def test_tokenization_base_easy_symbols(self):
        """Encode a short plain sentence and compare with the reference ids."""
        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

        symbols = "Hello World!"
        original_tokenizer_encodings = [0, 35378, 6661, 38, 2]
        # Snippet recorded alongside the reference ids for regenerating them with fairseq:
        #   xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')  # xlmr.large has same tokenizer
        #   xlmr.eval()
        #   xlmr.encode(symbols)

        self.assertListEqual(original_tokenizer_encodings, tokenizer.encode(symbols))

    @slow
    def test_tokenization_base_hard_symbols(self):
        """Encode a sentence with punctuation and made-up words and compare with the reference ids."""
        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

        # The literal "<unk>" belongs in the input: the reference ids contain three
        # entries (4426, 3678, 2740) between "to" (47) and "," (4) that have no
        # other source text -- presumably the pieces of the literal "<unk>".
        # The misspelling "exsist" is intentional input and must stay as is,
        # because the reference ids depend on it.
        symbols = (
            'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . '
            "Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
        )
        # fmt: off
        original_tokenizer_encodings = [
            0, 3293, 83, 10, 4552, 4989, 7986, 678, 10, 5915, 111, 179459, 124850, 4, 6044, 237,
            12, 6, 5, 6, 4, 6780, 705, 15, 1388, 44, 378, 10114, 711, 152, 20, 6,
            5, 22376, 642, 1221, 15190, 34153, 450, 5608, 959, 1119, 57702, 136, 186, 47, 1098, 29367,
            47, 4426, 3678, 2740, 4, 6044, 237, 6284, 50901, 528, 31, 90, 34, 927, 2,
        ]
        # fmt: on
        # Snippet recorded alongside the reference ids for regenerating them with fairseq:
        #   xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')  # xlmr.large has same tokenizer
        #   xlmr.eval()
        #   xlmr.encode(symbols)

        self.assertListEqual(original_tokenizer_encodings, tokenizer.encode(symbols))