From 626a0a01471accc32ded29ccca3ed93c4995fcd6 Mon Sep 17 00:00:00 2001 From: yujun <50394665+JunnYu@users.noreply.github.com> Date: Tue, 6 Jul 2021 15:31:57 +0800 Subject: [PATCH] [RoFormer] Fix some issues (#12397) * add RoFormerTokenizerFast into AutoTokenizer * fix typo in roformer docs * make onnx export happy * update RoFormerConfig embedding_size * use jieba not rjieba * fix 12244 and make test_alignement passed * update ARCHIVE_MAP * make style & quality & fixup * update * make style & quality & fixup * make style quality fixup * update * suggestion from LysandreJik Co-authored-by: Lysandre Debut * make style * use rjieba Co-authored-by: Lysandre Debut --- docs/source/model_doc/roformer.rst | 2 +- src/transformers/file_utils.py | 4 +++ .../models/auto/tokenization_auto.py | 4 ++- .../models/roformer/configuration_roformer.py | 15 ++++++---- .../models/roformer/modeling_roformer.py | 10 +++++-- .../models/roformer/modeling_tf_roformer.py | 6 +++- .../models/roformer/tokenization_roformer.py | 26 +++++++++++----- .../roformer/tokenization_roformer_fast.py | 17 ++++++++++- .../models/roformer/tokenization_utils.py | 30 +++++++++---------- src/transformers/testing_utils.py | 11 +++++++ tests/test_tokenization_roformer.py | 25 +++++----------- 11 files changed, 97 insertions(+), 53 deletions(-) diff --git a/docs/source/model_doc/roformer.rst b/docs/source/model_doc/roformer.rst index 6ca558abea..21f1fe6bbe 100644 --- a/docs/source/model_doc/roformer.rst +++ b/docs/source/model_doc/roformer.rst @@ -56,7 +56,7 @@ RoFormerTokenizer create_token_type_ids_from_sequences, save_vocabulary -RobertaTokenizerFast +RoFormerTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RoFormerTokenizerFast diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 5f522a440c..c3717a9289 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -315,6 +315,10 @@ def is_datasets_available(): return _datasets_available +def is_rjieba_available(): + return importlib.util.find_spec("rjieba") is not None + + def is_psutil_available(): return importlib.util.find_spec("psutil") is not None diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 14c59742f2..c417b486a8 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -198,6 +198,7 @@ if is_tokenizers_available(): from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast from ..retribert.tokenization_retribert_fast import RetriBertTokenizerFast from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast + from ..roformer.tokenization_roformer_fast import RoFormerTokenizerFast from ..squeezebert.tokenization_squeezebert_fast import SqueezeBertTokenizerFast from ..t5.tokenization_t5_fast import T5TokenizerFast from ..xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast @@ -232,6 +233,7 @@ else: ReformerTokenizerFast = None RetriBertTokenizerFast = None RobertaTokenizerFast = None + RoFormerTokenizerFast = None SqueezeBertTokenizerFast = None T5TokenizerFast = None XLMRobertaTokenizerFast = None @@ -245,7 +247,7 @@ logger = logging.get_logger(__name__) TOKENIZER_MAPPING = OrderedDict( [ (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), - (RoFormerConfig, (RoFormerTokenizer, None)), + (RoFormerConfig, (RoFormerTokenizer, RoFormerTokenizerFast)), (T5Config, (T5Tokenizer, T5TokenizerFast)), (MT5Config, (MT5Tokenizer, MT5TokenizerFast)), (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), diff --git a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py index 24e3e2c30f..945d1064a1 100644 --- a/src/transformers/models/roformer/configuration_roformer.py +++ b/src/transformers/models/roformer/configuration_roformer.py @@ -22,7 +22,11 @@ logger = logging.get_logger(__name__) ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { "junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/config.json", - "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/config.json" + "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/config.json", + "junnyu/roformer_chinese_char_small": "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/config.json", + "junnyu/roformer_chinese_char_base": "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/config.json", + "junnyu/roformer_small_discriminator": "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/config.json", + "junnyu/roformer_small_generator": "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/config.json", # See all RoFormer models at https://huggingface.co/models?filter=roformer } @@ -43,8 +47,9 @@ class RoFormerConfig(PretrainedConfig): Vocabulary size of the RoFormer model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.RoFormerModel` or :class:`~transformers.TFRoFormerModel`. - embedding_size (:obj:`int`, `optional`, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. + embedding_size (:obj:`int`, `optional`, defaults to None): + Dimensionality of the encoder layers and the pooler layer. Defaults to the :obj:`hidden_size` if not + provided. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimension of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, `optional`, defaults to 12): @@ -96,7 +101,7 @@ class RoFormerConfig(PretrainedConfig): def __init__( self, vocab_size=50000, - embedding_size=768, + embedding_size=None, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -117,7 +122,7 @@ class RoFormerConfig(PretrainedConfig): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size - self.embedding_size = embedding_size + self.embedding_size = hidden_size if embedding_size is None else embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index e7c42afd68..0e943617b7 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -60,7 +60,11 @@ _TOKENIZER_FOR_DOC = "RoFormerTokenizer" ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "junnyu/roformer_chinese_small", - "junnyu/roformer_chinese_base" + "junnyu/roformer_chinese_base", + "junnyu/roformer_chinese_char_small", + "junnyu/roformer_chinese_char_base", + "junnyu/roformer_small_discriminator", + "junnyu/roformer_small_generator" # See all RoFormer models at https://huggingface.co/models?filter=roformer ] @@ -327,9 +331,9 @@ class RoFormerSelfAttention(nn.Module): # cos [batch_size, num_heads, sequence_length, embed_size_per_head//2] sin, cos = sinusoidal_pos.chunk(2, dim=-1) # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] - sin_pos = torch.repeat_interleave(sin, 2, dim=-1) + sin_pos = torch.stack([sin, sin], dim=-1).reshape_as(sinusoidal_pos) # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] - cos_pos = torch.repeat_interleave(cos, 2, dim=-1) + cos_pos = torch.stack([cos, cos], dim=-1).reshape_as(sinusoidal_pos) # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2] rotate_half_query_layer = torch.stack([-query_layer[..., 1::2], query_layer[..., ::2]], dim=-1).reshape_as( query_layer diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index dae6e180b1..436acdbd30 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -65,7 +65,11 @@ _TOKENIZER_FOR_DOC = "RoFormerTokenizer" TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "junnyu/roformer_chinese_small", - "junnyu/roformer_chinese_base" + "junnyu/roformer_chinese_base", + "junnyu/roformer_chinese_char_small", + "junnyu/roformer_chinese_char_base", + "junnyu/roformer_small_discriminator", + "junnyu/roformer_small_generator" # See all RoFormer models at https://huggingface.co/models?filter=roformer ] diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py index efb5d83051..a425ec934c 100644 --- a/src/transformers/models/roformer/tokenization_roformer.py +++ b/src/transformers/models/roformer/tokenization_roformer.py @@ -31,15 +31,30 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt", "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt", + "junnyu/roformer_chinese_char_small": "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt", + "junnyu/roformer_chinese_char_base": "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt", + "junnyu/roformer_small_discriminator": "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt", + "junnyu/roformer_small_generator": "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt", } } -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"junnyu/roformer_chinese_small": 1536, "junnyu/roformer_chinese_base": 1536} +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "junnyu/roformer_chinese_small": 1536, + "junnyu/roformer_chinese_base": 1536, + "junnyu/roformer_chinese_char_small": 512, + "junnyu/roformer_chinese_char_base": 512, + "junnyu/roformer_small_discriminator": 128, + "junnyu/roformer_small_generator": 128, +} PRETRAINED_INIT_CONFIGURATION = { "junnyu/roformer_chinese_small": {"do_lower_case": True}, "junnyu/roformer_chinese_base": {"do_lower_case": True}, + "junnyu/roformer_chinese_char_small": {"do_lower_case": True}, + "junnyu/roformer_chinese_char_base": {"do_lower_case": True}, + "junnyu/roformer_small_discriminator": {"do_lower_case": True}, + "junnyu/roformer_small_generator": {"do_lower_case": True}, } @@ -166,13 +181,8 @@ class RoFormerTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import rjieba - except ImportError: - raise ImportError( - "You need to install rjieba to use RoFormerTokenizer." - "See https://pypi.org/project/rjieba/ for installation." - ) + import rjieba + self.jieba = rjieba def get_vocab(self): diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py index 983cc2fba5..736f157f92 100644 --- a/src/transformers/models/roformer/tokenization_roformer_fast.py +++ b/src/transformers/models/roformer/tokenization_roformer_fast.py @@ -33,15 +33,30 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt", "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt", + "junnyu/roformer_chinese_char_small": "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt", + "junnyu/roformer_chinese_char_base": "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt", + "junnyu/roformer_small_discriminator": "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt", + "junnyu/roformer_small_generator": "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt", } } -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"junnyu/roformer_chinese_small": 1536, "junnyu/roformer_chinese_base": 1536} +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "junnyu/roformer_chinese_small": 1536, + "junnyu/roformer_chinese_base": 1536, + "junnyu/roformer_chinese_char_small": 512, + "junnyu/roformer_chinese_char_base": 512, + "junnyu/roformer_small_discriminator": 128, + "junnyu/roformer_small_generator": 128, +} PRETRAINED_INIT_CONFIGURATION = { "junnyu/roformer_chinese_small": {"do_lower_case": True}, "junnyu/roformer_chinese_base": {"do_lower_case": True}, + "junnyu/roformer_chinese_char_small": {"do_lower_case": True}, + "junnyu/roformer_chinese_char_base": {"do_lower_case": True}, + "junnyu/roformer_small_discriminator": {"do_lower_case": True}, + "junnyu/roformer_small_generator": {"do_lower_case": True}, } diff --git a/src/transformers/models/roformer/tokenization_utils.py b/src/transformers/models/roformer/tokenization_utils.py index d956d5214c..195e6eff2d 100644 --- a/src/transformers/models/roformer/tokenization_utils.py +++ b/src/transformers/models/roformer/tokenization_utils.py @@ -41,26 +41,26 @@ class JiebaPreTokenizer: splits = [] # this code slice normalized_string is too slow (6s) but test_alignement_methods can pass - # for token, start, end in self.jieba.tokenize(str(normalized_string), hmm=False): - # if token in self.vocab: - # splits.append(normalized_string.slice((start, end))) - # else: - # token_list = self.normalizers.normalize_str(token).split() - # for token in token_list: - # if token: - # end = start + len(token) - # splits.append(normalized_string.slice((start, end))) - # start = end - - # this code test_alignement_methods can't pass but fast (300ms) - for token in self.jieba.cut(str(normalized_string), False): + for token, start, end in self.jieba.tokenize(str(normalized_string), hmm=False): if token in self.vocab: - splits.append(NormalizedString(token)) + splits.append(normalized_string[start:end]) else: token_list = self.normalizers.normalize_str(token).split() for token in token_list: if token: - splits.append(NormalizedString(token)) + end = start + len(token) + splits.append(normalized_string[start:end]) + start = end + + # this code test_alignement_methods can't pass but fast (300ms) + # for token in self.jieba.cut(str(normalized_string), False): + # if token in self.vocab: + # splits.append(NormalizedString(token)) + # else: + # token_list = self.normalizers.normalize_str(token).split() + # for token in token_list: + # if token: + # splits.append(NormalizedString(token)) return splits diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index d315785ed9..439cee385d 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -35,6 +35,7 @@ from .file_utils import ( is_flax_available, is_onnx_available, is_pandas_available, + is_rjieba_available, is_scatter_available, is_sentencepiece_available, is_soundfile_availble, @@ -223,6 +224,16 @@ def require_git_lfs(test_case): return test_case +def require_rjieba(test_case): + """ + Decorator marking a test that requires rjieba. These tests are skipped when rjieba isn't installed. + """ + if not is_rjieba_available(): + return unittest.skip("test requires rjieba")(test_case) + else: + return test_case + + def require_onnx(test_case): if not is_onnx_available(): return unittest.skip("test requires ONNX")(test_case) diff --git a/tests/test_tokenization_roformer.py b/tests/test_tokenization_roformer.py index 19c7fb6543..c5e19b66b2 100644 --- a/tests/test_tokenization_roformer.py +++ b/tests/test_tokenization_roformer.py @@ -13,29 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import importlib import unittest from transformers import RoFormerTokenizer, RoFormerTokenizerFast -from transformers.testing_utils import require_tokenizers +from transformers.testing_utils import require_rjieba, require_tokenizers from .test_tokenization_common import TokenizerTesterMixin -def is_rjieba_available(): - return importlib.util.find_spec("rjieba") is not None - - -def require_rjieba(test_case): - """ - Decorator marking a test that requires Jieba. These tests are skipped when Jieba isn't installed. - """ - if not is_rjieba_available(): - return unittest.skip("test requires rjieba")(test_case) - else: - return test_case - - @require_rjieba @require_tokenizers class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -79,6 +64,10 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens) - # due to custom pre_tokenize , char_to_token may be error - def test_alignement_methods(self): + # can't train new_tokenizer via Tokenizers lib + def test_training_new_tokenizer(self): + pass + + # can't train new_tokenizer via Tokenizers lib + def test_training_new_tokenizer_with_special_tokens_change(self): pass