Tokenization tests + fixes + init
This commit is contained in:
@@ -42,6 +42,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
|||||||
from .tokenization_xlm import XLMTokenizer
|
from .tokenization_xlm import XLMTokenizer
|
||||||
from .tokenization_roberta import RobertaTokenizer
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
from .tokenization_distilbert import DistilBertTokenizer
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
from .tokenization_albert import AlbertTokenizer
|
||||||
from .tokenization_camembert import CamembertTokenizer
|
from .tokenization_camembert import CamembertTokenizer
|
||||||
|
|
||||||
# Configurations
|
# Configurations
|
||||||
@@ -57,6 +58,8 @@ from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|||||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_albert import AlbertConfig, ALBERT
|
||||||
|
from .configuration_albert import AlbertConfig
|
||||||
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
# Modeling
|
# Modeling
|
||||||
@@ -104,6 +107,8 @@ if is_torch_available():
|
|||||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||||
|
|
||||||
|
from .modeling_albert import (AlbertModel, AlbertForMaskedLM)
|
||||||
|
|
||||||
# Optimization
|
# Optimization
|
||||||
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
||||||
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
||||||
|
|||||||
BIN
transformers/tests/fixtures/30k-clean.model
vendored
Normal file
BIN
transformers/tests/fixtures/30k-clean.model
vendored
Normal file
Binary file not shown.
78
transformers/tests/tokenization_albert_test.py
Normal file
78
transformers/tests/tokenization_albert_test.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019 Hugging Face inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)
|
||||||
|
|
||||||
|
from .tokenization_tests_commons import CommonTestCases
|
||||||
|
|
||||||
|
# Absolute path of the SentencePiece model fixture, resolved relative to the
# directory that contains this test module.
SAMPLE_VOCAB = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'fixtures/30k-clean.model')
|
||||||
|
|
||||||
|
class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
    """Tests for ``AlbertTokenizer`` built on the SentencePiece fixture ``SAMPLE_VOCAB``.

    The class plugs ``AlbertTokenizer`` into the shared
    ``CommonTestCases.CommonTokenizerTester`` harness (via ``tokenizer_class``,
    ``get_tokenizer`` and ``get_input_output_texts``) and adds two
    ALBERT-specific cases: a full tokenize/convert round trip and a check of
    the special-token layout produced by ``build_inputs_with_special_tokens``.
    """

    tokenizer_class = AlbertTokenizer

    def setUp(self):
        """Save a fixture-backed tokenizer into ``self.tmpdirname``."""
        # NOTE(review): ``self.tmpdirname`` is presumably created by the parent
        # ``setUp`` -- confirm against ``tokenization_tests_commons``.
        super(AlbertTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        """Reload the tokenizer saved by ``setUp``, forwarding ``kwargs``."""
        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        """Return an ``(input_text, output_text)`` pair; both are the same sentence."""
        input_text = u"this is a test"
        output_text = u"this is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        """Check tokenization, token -> id conversion and id -> token conversion."""
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        # The expected pieces are lower-cased even though the input is not.
        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and',
             u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])

        # The accented piece u'é' is expected to map to id 1 ...
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        # ... which converts back to '<unk>' rather than to the original piece.
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and',
             '▁this', '▁is', '▁fal', 's', '<unk>', '.'])

    def test_sequence_builders(self):
        """Check the layout ``[CLS] A [SEP]`` and ``[CLS] A [SEP] B [SEP]``."""
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)

        # NOTE(review): the expectations below assume ``encode`` returns ids
        # without special tokens -- confirm the default of ``add_special_tokens``.
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        cls = [tokenizer.cls_token_id]
        sep = [tokenizer.sep_token_id]

        # unittest assertions instead of bare ``assert``: they are not stripped
        # under ``python -O`` and report both operands on failure.
        self.assertEqual(encoded_sentence, cls + text + sep)
        self.assertEqual(encoded_pair, cls + text + sep + text_2 + sep)
|
||||||
|
|
||||||
|
|
||||||
|
# Run this module's tests when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
@@ -8,6 +8,7 @@ from shutil import copyfile
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
VOCAB_FILES_NAMES = {'vocab_file': '30k-clean.model'}
|
||||||
SPIECE_UNDERLINE = u'▁'
|
SPIECE_UNDERLINE = u'▁'
|
||||||
|
|
||||||
class AlbertTokenizer(PreTrainedTokenizer):
|
class AlbertTokenizer(PreTrainedTokenizer):
|
||||||
@@ -16,12 +17,12 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
||||||
"""
|
"""
|
||||||
# vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
# pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
# pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
# max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
# max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
|
||||||
def __init__(self, vocab_file,
|
def __init__(self, vocab_file,
|
||||||
do_lower_case=False, remove_space=True, keep_accents=False,
|
do_lower_case=True, remove_space=True, keep_accents=False,
|
||||||
bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
|
bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
|
||||||
pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]>", **kwargs):
|
pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]>", **kwargs):
|
||||||
super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
|
super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
|
||||||
@@ -142,15 +143,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A RoBERTa sequence has the following format:
|
An ALBERT sequence has the following format:
|
||||||
single sequence: <s> X </s>
|
single sequence: [CLS] X [SEP]
|
||||||
pair of sequences: <s> A </s></s> B </s>
|
pair of sequences: [CLS] A [SEP] B [SEP]
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return token_ids_0 + sep + cls
|
return cls + token_ids_0 + sep
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
"""
|
"""
|
||||||
@@ -175,25 +176,24 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
if token_ids_1 is not None:
|
if token_ids_1 is not None:
|
||||||
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
return ([0] * len(token_ids_0)) + [1, 1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A BERT sequence pair mask has the following format:
|
An ALBERT sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence | CLS segment ID
|
| first sequence | second sequence
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
cls_segment_id = [2]
|
|
||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return len(token_ids_0 + sep + cls) * [0]
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
||||||
|
|||||||
@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A RoBERTa sequence has the following format:
|
An XLNet sequence has the following format:
|
||||||
single sequence: <s> X </s>
|
single sequence: X <sep> <cls>
|
||||||
pair of sequences: <s> A </s></s> B </s>
|
pair of sequences: A <sep> B <sep> <cls>
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A BERT sequence pair mask has the following format:
|
An XLNet sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
||||||
| first sequence | second sequence | CLS segment ID
|
| first sequence | second sequence | CLS segment ID
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user