Flaubert auto tokenizer + tests

cc @julien-c
This commit is contained in:
Lysandre
2020-01-31 14:16:34 -05:00
parent d18d47be67
commit 1e82cd8457
4 changed files with 53 additions and 2 deletions

View File

@@ -50,8 +50,8 @@ class FlaubertConfig(XLMConfig):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020) with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145): vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that Vocabulary size of the Flaubert model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048): emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer. Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12): n_layer (:obj:`int`, optional, defaults to 12):

View File

@@ -25,6 +25,7 @@ from .configuration_auto import (
CamembertConfig, CamembertConfig,
CTRLConfig, CTRLConfig,
DistilBertConfig, DistilBertConfig,
FlaubertConfig,
GPT2Config, GPT2Config,
OpenAIGPTConfig, OpenAIGPTConfig,
RobertaConfig, RobertaConfig,
@@ -41,6 +42,7 @@ from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_camembert import CamembertTokenizer from .tokenization_camembert import CamembertTokenizer
from .tokenization_ctrl import CTRLTokenizer from .tokenization_ctrl import CTRLTokenizer
from .tokenization_distilbert import DistilBertTokenizer from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_flaubert import FlaubertTokenizer
from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_roberta import RobertaTokenizer from .tokenization_roberta import RobertaTokenizer
@@ -67,6 +69,7 @@ TOKENIZER_MAPPING = OrderedDict(
(GPT2Config, GPT2Tokenizer), (GPT2Config, GPT2Tokenizer),
(TransfoXLConfig, TransfoXLTokenizer), (TransfoXLConfig, TransfoXLTokenizer),
(XLNetConfig, XLNetTokenizer), (XLNetConfig, XLNetTokenizer),
(FlaubertConfig, FlaubertTokenizer),
(XLMConfig, XLMTokenizer), (XLMConfig, XLMTokenizer),
(CTRLConfig, CTRLTokenizer), (CTRLConfig, CTRLTokenizer),
] ]

View File

@@ -39,6 +39,14 @@ if is_torch_available():
BertForQuestionAnswering, BertForQuestionAnswering,
) )
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_auto import (
MODEL_MAPPING,
MODEL_FOR_PRETRAINING_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_WITH_LM_HEAD_MAPPING,
)
@require_torch @require_torch
@@ -127,3 +135,26 @@ class AutoModelTest(unittest.TestCase):
self.assertIsInstance(model, RobertaForMaskedLM) self.assertIsInstance(model, RobertaForMaskedLM)
self.assertEqual(model.num_parameters(), 14830) self.assertEqual(model.num_parameters(), 14830)
self.assertEqual(model.num_parameters(only_trainable=True), 14830) self.assertEqual(model.num_parameters(only_trainable=True), 14830)
def test_parents_and_children_in_mappings(self):
    # The auto classes walk each mapping in order and pick the first entry whose config class
    # matches via `isinstance`. A subclass listed after its parent would therefore be shadowed
    # by the parent, so no entry may be a subclass of any entry that precedes it.
    all_mappings = (
        MODEL_MAPPING,
        MODEL_FOR_PRETRAINING_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        MODEL_WITH_LM_HEAD_MAPPING,
    )

    for auto_mapping in all_mappings:
        entries = tuple(auto_mapping.items())

        # Compare every entry (from the second one on) against all entries listed before it.
        for position in range(1, len(entries)):
            later_config, later_model = entries[position]

            for earlier_config, earlier_model in entries[:position]:
                description = "Testing if {} is child of {}".format(
                    later_config.__name__, earlier_config.__name__
                )
                with self.subTest(msg=description):
                    self.assertFalse(issubclass(later_config, earlier_config))
                    self.assertFalse(issubclass(later_model, earlier_model))

View File

@@ -25,6 +25,7 @@ from transformers import (
GPT2Tokenizer, GPT2Tokenizer,
RobertaTokenizer, RobertaTokenizer,
) )
from transformers.tokenization_auto import TOKENIZER_MAPPING
from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow # noqa: F401 from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow # noqa: F401
@@ -70,3 +71,19 @@ class AutoTokenizerTest(unittest.TestCase):
for tokenizer_class in [BertTokenizer, AutoTokenizer]: for tokenizer_class in [BertTokenizer, AutoTokenizer]:
with self.assertRaises(EnvironmentError): with self.assertRaises(EnvironmentError):
_ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists") _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
def test_parents_and_children_in_mappings(self):
    # AutoTokenizer walks TOKENIZER_MAPPING in order and picks the first entry whose config class
    # matches via `isinstance`. A subclass listed after its parent would be shadowed by the
    # parent, so no entry may be a subclass of any entry that precedes it.
    entries = tuple(TOKENIZER_MAPPING.items())

    # Compare every entry (from the second one on) against all entries listed before it.
    for position in range(1, len(entries)):
        later_config, later_tokenizer = entries[position]

        for earlier_config, earlier_tokenizer in entries[:position]:
            description = "Testing if {} is child of {}".format(
                later_config.__name__, earlier_config.__name__
            )
            with self.subTest(msg=description):
                self.assertFalse(issubclass(later_config, earlier_config))
                self.assertFalse(issubclass(later_tokenizer, earlier_tokenizer))