Tokenization tests + fixes + init

2019-10-30 16:19:49 +00:00
parent e3ea5d1d8d
commit ee20201d33
5 changed files with 102 additions and 19 deletions
--- a/transformers/init.py
+++ b/transformers/init.py
@@ -42,6 +42,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_camembert import CamembertTokenizer
 # Configurations
@@ -57,6 +58,8 @@ from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_albert import AlbertConfig, ALBERT
 from .configuration_albert import AlbertConfig
 from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
@@ -104,6 +107,8 @@ if is_torch_available():
                                CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
    from .modeling_albert import (AlbertModel, AlbertForMaskedLM)
    # Optimization
    from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
                               get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
--- a/transformers/tests/fixtures/30k-clean.model
+++ b/transformers/tests/fixtures/30k-clean.model
--- a/transformers/tests/tokenization_albert_test.py
+++ b/transformers/tests/tokenization_albert_test.py
@@ -0,0 +1,78 @@
 # coding=utf-8
 # Copyright 2019 Hugging Face inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)
 from .tokenization_tests_commons import CommonTestCases
 # Absolute path to the SentencePiece model fixture stored under ``fixtures/`` next to this test file.
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                    'fixtures/30k-clean.model')
 class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
    """Tests for ``AlbertTokenizer`` driven by the SentencePiece fixture at ``SAMPLE_VOCAB``."""
    tokenizer_class = AlbertTokenizer
    def setUp(self):
        """Build a tokenizer from the fixture and save it so ``get_tokenizer`` can reload it."""
        super(AlbertTokenizationTest, self).setUp()
        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        # NOTE(review): ``self.tmpdirname`` is presumably created by the parent ``setUp`` -- confirm.
        tokenizer.save_pretrained(self.tmpdirname)
    def get_tokenizer(self, **kwargs):
        """Reload the tokenizer saved by ``setUp``, forwarding ``kwargs`` to ``from_pretrained``."""
        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
    def get_input_output_texts(self):
        """Return an ``(input_text, output_text)`` pair; both are the same lowercase sentence."""
        input_text = u"this is a test"
        output_text = u"this is a test"
        return input_text, output_text
    def test_full_tokenizer(self):
        """Check tokenization, token -> id conversion and id -> token conversion on the fixture."""
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
        # The capitalised input is expected to come back as lowercase pieces.
        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
        # With keep_accents=True the accented character is expected to survive as its own piece.
        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # The accented piece is expected to map to id 1 ...
        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        # ... and id 1 is expected to convert back to '<unk>', so the round trip is lossy there.
        self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', '<unk>', '.'])
    def test_sequence_builders(self):
        """Check the special-token layout built for a single sequence and for a sequence pair."""
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")
        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
        cls = [tokenizer.cls_token_id]
        sep = [tokenizer.sep_token_id]
        # Use unittest assertions instead of bare ``assert``: they are not stripped under
        # ``python -O`` and they report both operands when the comparison fails.
        # Expected layout for a single sequence: [CLS] X [SEP]
        self.assertEqual(encoded_sentence, cls + text + sep)
        # Expected layout for a pair of sequences: [CLS] A [SEP] B [SEP]
        self.assertEqual(encoded_pair, cls + text + sep + text_2 + sep)
 # Run this module's tests when the file is executed directly as a script.
 if __name__ == '__main__':
    unittest.main()
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -8,6 +8,7 @@ from shutil import copyfile
 logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {'vocab_file': '30k-clean.model'}
 SPIECE_UNDERLINE = u'▁'
 class AlbertTokenizer(PreTrainedTokenizer):
@@ -16,12 +17,12 @@ class AlbertTokenizer(PreTrainedTokenizer):
            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
-    # vocab_files_names = VOCAB_FILES_NAMES
+    vocab_files_names = VOCAB_FILES_NAMES
    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    def __init__(self, vocab_file,
-                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 do_lower_case=True, remove_space=True, keep_accents=False,
                 bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
                 pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]>", **kwargs):
        super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
@@ -142,15 +143,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
+        An ALBERT sequence has the following format:
-            single sequence: <s> X </s>
+            single sequence: [CLS] X [SEP]
-            pair of sequences: <s> A </s></s> B </s>
+            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
-            return token_ids_0 + sep + cls
+            return cls + token_ids_0 + sep
-        return token_ids_0 + sep + token_ids_1 + sep + cls
+        return cls + token_ids_0 + sep + token_ids_1 + sep
    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
@@ -175,25 +176,24 @@ class AlbertTokenizer(PreTrainedTokenizer):
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
        if token_ids_1 is not None:
-            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return ([0] * len(token_ids_0)) + [1, 1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
+        An ALBERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 
-        | first sequence    | second sequence     | CLS segment ID
+        | first sequence    | second sequence     
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        cls_segment_id = [2]
        if token_ids_1 is None:
-            return len(token_ids_0 + sep + cls) * [0]
+            return len(cls + token_ids_0 + sep) * [0]
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
+        An XLNet sequence has the following format:
-            single sequence: <s> X </s>
+            single sequence: X <sep> <cls>
-            pair of sequences: <s> A </s></s> B </s>
+            pair of sequences: A <sep> B <sep> <cls>
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
+        An XLNet sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
        | first sequence    | second sequence     | CLS segment ID