Flaubert auto tokenizer + tests

cc @julien-c
2020-01-31 14:16:34 -05:00
parent d18d47be67
commit 1e82cd8457
4 changed files with 53 additions and 2 deletions
--- a/src/transformers/configuration_flaubert.py
+++ b/src/transformers/configuration_flaubert.py
@@ -50,8 +50,8 @@ class FlaubertConfig(XLMConfig):
                Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
                with Structured Dropout. ICLR 2020)
            vocab_size (:obj:`int`, optional, defaults to 30145):
-                Vocabulary size of the XLM model. Defines the different tokens that
+                Vocabulary size of the Flaubert model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
            emb_dim (:obj:`int`, optional, defaults to 2048):
                Dimensionality of the encoder layers and the pooler layer.
            n_layer (:obj:`int`, optional, defaults to 12):
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -25,6 +25,7 @@ from .configuration_auto import (
    CamembertConfig,
    CTRLConfig,
    DistilBertConfig,
    FlaubertConfig,
    GPT2Config,
    OpenAIGPTConfig,
    RobertaConfig,
@@ -41,6 +42,7 @@ from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_ctrl import CTRLTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_roberta import RobertaTokenizer
@@ -67,6 +69,7 @@ TOKENIZER_MAPPING = OrderedDict(
        (GPT2Config, GPT2Tokenizer),
        (TransfoXLConfig, TransfoXLTokenizer),
        (XLNetConfig, XLNetTokenizer),
        (FlaubertConfig, FlaubertTokenizer),
        (XLMConfig, XLMTokenizer),
        (CTRLConfig, CTRLTokenizer),
    ]
--- a/tests/test_modeling_auto.py
+++ b/tests/test_modeling_auto.py
@@ -39,6 +39,14 @@ if is_torch_available():
        BertForQuestionAnswering,
    )
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
    from transformers.modeling_auto import (
        MODEL_MAPPING,
        MODEL_FOR_PRETRAINING_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        MODEL_WITH_LM_HEAD_MAPPING,
    )
@require_torch
@@ -127,3 +135,26 @@ class AutoModelTest(unittest.TestCase):
        self.assertIsInstance(model, RobertaForMaskedLM)
        self.assertEqual(model.num_parameters(), 14830)
        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
    def test_parents_and_children_in_mappings(self):
        # Ordering check for the auto-model mappings: no entry may be a subclass of an entry listed before it.
        # Per the original rationale, the auto models resolve an entry with an `isinstance`-style check, so a
        # parent placed ahead of its child would match first and the wrong class would be returned.
        all_mappings = (
            MODEL_MAPPING,
            MODEL_FOR_PRETRAINING_MAPPING,
            MODEL_FOR_QUESTION_ANSWERING_MAPPING,
            MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
            MODEL_WITH_LM_HEAD_MAPPING,
        )

        for auto_mapping in all_mappings:
            entries = tuple(auto_mapping.items())
            # Compare every entry (from the second one on) against all entries that precede it.
            for position in range(1, len(entries)):
                later_config, later_model = entries[position]
                for earlier_config, earlier_model in entries[:position]:
                    label = "Testing if {} is child of {}".format(later_config.__name__, earlier_config.__name__)
                    with self.subTest(msg=label):
                        # Check both halves of the pair: the config class and the model class.
                        self.assertFalse(issubclass(later_config, earlier_config))
                        self.assertFalse(issubclass(later_model, earlier_model))
--- a/tests/test_tokenization_auto.py
+++ b/tests/test_tokenization_auto.py
@@ -25,6 +25,7 @@ from transformers import (
    GPT2Tokenizer,
    RobertaTokenizer,
 )
 from transformers.tokenization_auto import TOKENIZER_MAPPING
 from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow  # noqa: F401
@@ -70,3 +71,19 @@ class AutoTokenizerTest(unittest.TestCase):
        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
            with self.assertRaises(EnvironmentError):
                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
    def test_parents_and_children_in_mappings(self):
        # Ordering check for the auto-tokenizer mapping: no entry may be a subclass of an entry listed before it.
        # Per the original rationale, the auto classes resolve an entry with an `isinstance`-style check, so a
        # parent placed ahead of its child would match first and the wrong class would be returned.
        all_mappings = (TOKENIZER_MAPPING,)

        for auto_mapping in all_mappings:
            entries = tuple(auto_mapping.items())
            # Compare every entry (from the second one on) against all entries that precede it.
            for position in range(1, len(entries)):
                later_config, later_tokenizer = entries[position]
                for earlier_config, earlier_tokenizer in entries[:position]:
                    label = "Testing if {} is child of {}".format(later_config.__name__, earlier_config.__name__)
                    with self.subTest(msg=label):
                        # Check both halves of the pair: the config class and the tokenizer class.
                        self.assertFalse(issubclass(later_config, earlier_config))
                        self.assertFalse(issubclass(later_tokenizer, earlier_tokenizer))