From d0d5aee1dd26de108a901921b9df19b889430645 Mon Sep 17 00:00:00 2001 From: David Yang Date: Wed, 12 Oct 2022 00:09:17 +0800 Subject: [PATCH] Make bert_japanese and cpm independent of their inherited modules (#19431) * Make cpm tokenization independent of xlnet * Make bert japanese tokenization independent of bert --- .../tokenization_bert_japanese.py | 373 +++++++++++++++++- .../models/cpm/tokenization_cpm.py | 251 +++++++++++- .../models/cpm/tokenization_cpm_fast.py | 131 +++++- .../test_tokenization_bert_japanese.py | 2 +- 4 files changed, 738 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 0b6ccab3c4..54d77ea279 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -19,10 +19,10 @@ import collections import copy import os import unicodedata -from typing import Optional +from typing import List, Optional, Tuple +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...utils import logging -from ..bert.tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab logger = logging.get_logger(__name__) @@ -75,10 +75,35 @@ PRETRAINED_INIT_CONFIGURATION = { } -class BertJapaneseTokenizer(BertTokenizer): +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertJapaneseTokenizer(PreTrainedTokenizer): r""" Construct a BERT tokenizer for Japanese text. + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer + to: this superclass for more information regarding those methods. + Args: vocab_file (`str`): Path to a one-wordpiece-per-line vocabulary file. @@ -124,7 +149,7 @@ class BertJapaneseTokenizer(BertTokenizer): jumanpp_kwargs=None, **kwargs ): - super(BertTokenizer, self).__init__( + super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, @@ -141,7 +166,6 @@ class BertJapaneseTokenizer(BertTokenizer): jumanpp_kwargs=jumanpp_kwargs, **kwargs, ) - # ^^ We call the grandparent's init, not the parent's. if not os.path.isfile(vocab_file): raise ValueError( @@ -226,6 +250,137 @@ class BertJapaneseTokenizer(BertTokenizer): return split_tokens + @property + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size + def vocab_size(self): + return len(self.vocab) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + class MecabTokenizer: """Runs basic tokenization with MeCab morphological parser.""" @@ -530,3 +685,211 @@ class CharacterTokenizer: output_tokens.append(char) return output_tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py index 812e887bc2..bf2ec8f7c4 100644 --- a/src/transformers/models/cpm/tokenization_cpm.py +++ b/src/transformers/models/cpm/tokenization_cpm.py @@ -13,8 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" -from ...utils import logging -from ..xlnet.tokenization_xlnet import XLNetTokenizer +import os +import unicodedata +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import SPIECE_UNDERLINE, logging logger = logging.get_logger(__name__) @@ -28,10 +35,26 @@ PRETRAINED_VOCAB_FILES_MAP = { } -class CpmTokenizer(XLNetTokenizer): +class CpmTokenizer(PreTrainedTokenizer): """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models.""" - def __init__(self, *args, **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=False, + remove_space=True, + keep_accents=False, + bos_token="", + eos_token="", + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=["", ""], + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs + ) -> None: """ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece). @@ -93,7 +116,37 @@ class CpmTokenizer(XLNetTokenizer): sp_model (`SentencePieceProcessor`): The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). """ - super().__init__(*args, **kwargs) + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self._pad_token_type_id = 3 + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + try: import jieba except ModuleNotFoundError as error: @@ -104,10 +157,190 @@ class CpmTokenizer(XLNetTokenizer): self.jieba = jieba self.translator = str.maketrans(" \n", "\u2582\u2583") - def _tokenize(self, text, *args, **kwargs): - text = [x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)] - text = " ".join(text) - return super()._tokenize(text, *args, **kwargs) + @property + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size + def vocab_size(self): + return len(self.sp_model) + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_vocab + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__getstate__ + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__setstate__ + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.preprocess_text + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._tokenize + def _tokenize(self, text: str) -> List[str]: + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLNet sequence has the following format: + + - single sequence: `X ` + - pair of sequences: `A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return token_ids_0 + sep + cls + return token_ids_0 + sep + token_ids_1 + sep + cls + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] + return ([0] * len(token_ids_0)) + [1, 1] + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls_segment_id = [2] + + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + cls_segment_id + return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id + + # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) def _decode(self, *args, **kwargs): text = super()._decode(*args, **kwargs) diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py index 748046b8d1..032aebcf5b 100644 --- a/src/transformers/models/cpm/tokenization_cpm_fast.py +++ b/src/transformers/models/cpm/tokenization_cpm_fast.py @@ -13,8 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from ...tokenization_utils_fast import AddedToken, PreTrainedTokenizerFast from ...utils import logging -from ..xlnet.tokenization_xlnet_fast import XLNetTokenizerFast logger = logging.get_logger(__name__) @@ -31,10 +35,26 @@ PRETRAINED_VOCAB_FILES_MAP = { } -class CpmTokenizerFast(XLNetTokenizerFast): +class CpmTokenizerFast(PreTrainedTokenizerFast): """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models.""" - def __init__(self, *args, **kwargs): + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=False, + remove_space=True, + keep_accents=False, + bos_token="", + eos_token="", + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=["", ""], + **kwargs + ): """ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece). @@ -96,7 +116,33 @@ class CpmTokenizerFast(XLNetTokenizerFast): sp_model (`SentencePieceProcessor`): The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). """ - super().__init__(*args, **kwargs) + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file=vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self._pad_token_type_id = 3 + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + try: import jieba except ModuleNotFoundError as error: @@ -107,6 +153,83 @@ class CpmTokenizerFast(XLNetTokenizerFast): self.jieba = jieba self.translator = str.maketrans(" \n", "\u2582\u2583") + # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLNet sequence has the following format: + + - single sequence: `X ` + - pair of sequences: `A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return token_ids_0 + sep + cls + return token_ids_0 + sep + token_ids_1 + sep + cls + + # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls_segment_id = [2] + + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + cls_segment_id + return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id + + # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs): batch_text_or_text_pairs = [ " ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)]) diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index 9aba5c3705..7141f75514 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -19,10 +19,10 @@ import pickle import unittest from transformers import AutoTokenizer +from transformers.models.bert.tokenization_bert import BertTokenizer from transformers.models.bert_japanese.tokenization_bert_japanese import ( VOCAB_FILES_NAMES, BertJapaneseTokenizer, - BertTokenizer, CharacterTokenizer, JumanppTokenizer, MecabTokenizer,