Add integration tests for xlm roberta modelling and xlm roberta tokenizer (#3014)
* add first files * add xlm roberta integration tests * make style * flake 8 issues solved
This commit is contained in:
committed by
GitHub
parent
e8ce63ff21
commit
c913eb9c38
111
tests/test_tokenization_xlm_roberta.py
Normal file
111
tests/test_tokenization_xlm_roberta.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
|
||||
|
||||
from .utils import slow
|
||||
|
||||
|
||||
class XLMRobertaTokenizationIntegrationTest(unittest.TestCase):
    """Compare the pretrained "xlm-roberta-base" tokenizer against fixed reference ids.

    The reference ids hard-coded in each test can be regenerated with fairseq:

        xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base')  # xlmr.large has same tokenizer
        xlmr.eval()
        xlmr.encode(text)
    """

    def _check_encoding(self, text, reference_ids):
        """Encode ``text`` with the pretrained tokenizer and require ``reference_ids`` exactly."""
        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        self.assertListEqual(reference_ids, tokenizer.encode(text))

    @slow
    def test_tokenization_base_easy_symbols(self):
        """A short, plain sentence encodes to the reference ids."""
        self._check_encoding("Hello World!", [0, 35378, 6661, 38, 2])

    @slow
    def test_tokenization_base_hard_symbols(self):
        """Punctuation-heavy text containing a made-up word encodes to the reference ids."""
        text = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
        # fmt: off
        reference_ids = [
            0, 3293, 83, 10, 4552, 4989, 7986, 678, 10, 5915,
            111, 179459, 124850, 4, 6044, 237, 12, 6, 5, 6,
            4, 6780, 705, 15, 1388, 44, 378, 10114, 711, 152,
            20, 6, 5, 22376, 642, 1221, 15190, 34153, 450, 5608,
            959, 1119, 57702, 136, 186, 47, 1098, 29367, 47, 4426,
            3678, 2740, 4, 6044, 237, 6284, 50901, 528, 31, 90,
            34, 927, 2,
        ]
        # fmt: on
        self._check_encoding(text, reference_ids)
|
||||
Reference in New Issue
Block a user