Add support for Japanese BERT models by cl-tohoku
This commit is contained in:
committed by
Julien Chaumond
parent
030faccb8d
commit
c03c0dfd23
@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on uncased German text by DBMDZ |
|
||||
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
|
||||
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
|
||||
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text. Text is tokenized into characters. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | OpenAI GPT English model |
|
||||
|
||||
@@ -37,6 +37,7 @@ if is_sklearn_available():
|
||||
from .tokenization_utils import (PreTrainedTokenizer)
|
||||
from .tokenization_auto import AutoTokenizer
|
||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
|
||||
@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
|
||||
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
|
||||
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
|
||||
'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json",
|
||||
'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json",
|
||||
'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json",
|
||||
'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json"
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
|
||||
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
|
||||
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
|
||||
'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin",
|
||||
'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin",
|
||||
'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin",
|
||||
'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
|
||||
}
|
||||
|
||||
|
||||
@@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
|
||||
input_ids = tokenizer.encode(input_text)
|
||||
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
||||
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
||||
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
|
||||
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
|
||||
# a nice puppet
|
||||
|
||||
|
||||
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
|
||||
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
|
||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
|
||||
'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5",
|
||||
'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5",
|
||||
'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5",
|
||||
'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5"
|
||||
}
|
||||
|
||||
|
||||
@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||
linear tensor, float32 with shape [batch_size, length, vocab_size].
|
||||
Raises:
|
||||
ValueError: if mode is not valid.
|
||||
|
||||
|
||||
Shared weights logic adapted from
|
||||
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
||||
"""
|
||||
@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||
input_shape = shape_list(input_ids)
|
||||
else:
|
||||
input_shape = shape_list(inputs_embeds)[:-1]
|
||||
|
||||
|
||||
seq_length = input_shape[1]
|
||||
if position_ids is None:
|
||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||
@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
||||
context_layer = tf.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
|
||||
context_layer = tf.reshape(context_layer,
|
||||
context_layer = tf.reshape(context_layer,
|
||||
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
|
||||
|
||||
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
|
||||
@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
|
||||
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||
|
||||
Parameters:
|
||||
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
|
||||
(a) For sequence pairs:
|
||||
|
||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||
|
||||
(b) For single sequences:
|
||||
|
||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0``
|
||||
|
||||
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
|
||||
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
import logging
|
||||
|
||||
from .tokenization_bert import BertTokenizer
|
||||
from .tokenization_bert_japanese import BertJapaneseTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
from .tokenization_ctrl import CTRLTokenizer
|
||||
@@ -118,6 +119,8 @@ class AutoTokenizer(object):
|
||||
return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'bert-japanese' in pretrained_model_name_or_path:
|
||||
return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
|
||||
247
transformers/tokenization_bert_japanese.py
Normal file
247
transformers/tokenization_bert_japanese.py
Normal file
@@ -0,0 +1,247 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import collections
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Name of the vocabulary file looked up for each tokenizer.
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

# Download location of the vocabulary file for each pretrained shortcut name.
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file': {
        'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt",
        'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt",
        'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt",
        'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt",
    },
}

# Positional embedding size of each pretrained model
# (exposed as ``max_model_input_sizes`` on the tokenizer class).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'bert-base-japanese': 512,
    'bert-base-japanese-whole-word-masking': 512,
    'bert-base-japanese-char': 512,
    'bert-base-japanese-char-whole-word-masking': 512,
}

# Per-model tokenizer settings; the keys match keyword arguments of
# ``BertJapaneseTokenizer.__init__``. All four models use MeCab for word
# tokenization; the "char" variants split words into characters instead of
# WordPiece units.
PRETRAINED_INIT_CONFIGURATION = {
    'bert-base-japanese': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece',
    },
    'bert-base-japanese-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece',
    },
    'bert-base-japanese-char': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character',
    },
    'bert-base-japanese-char-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character',
    },
}
|
||||
|
||||
|
||||
class BertJapaneseTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text.

    Tokenization runs in up to two stages: an optional word tokenizer
    (``BasicTokenizer`` or ``MecabTokenizer``) followed by an optional
    subword tokenizer (``WordpieceTokenizer`` or ``CharacterTokenizer``).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=False,
                 do_word_tokenize=True, do_subword_tokenize=True,
                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        """Constructs a BertJapaneseTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_word_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer. Either "basic" or "mecab".
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer. Either "wordpiece" or "character".
            **never_split**: (`optional`) list of str
                Tokens forwarded to the word tokenizer as tokens not to split.

        Raises:
            ValueError: if ``vocab_file`` is not an existing file, or if an
                enabled tokenization stage is given an unknown tokenizer type.
        """
        # Start the MRO lookup *after* BertTokenizer: BertTokenizer.__init__ is
        # deliberately not run, this method sets up the vocabulary and the
        # tokenizers itself.
        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                            pad_token=pad_token, cls_token=cls_token,
                                            mask_token=mask_token, **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a pretrained "
                "model use `tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping (id -> token), in vocabulary iteration order.
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        # The word tokenizer is only created (and its type only validated)
        # when word tokenization is enabled.
        self.do_word_tokenize = do_word_tokenize
        if do_word_tokenize:
            if word_tokenizer_type == 'basic':
                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split,
                                                     tokenize_chinese_chars=False)
            elif word_tokenizer_type == 'mecab':
                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split)
            else:
                raise ValueError(
                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

        # Same for the subword tokenizer.
        self.do_subword_tokenize = do_subword_tokenize
        if do_subword_tokenize:
            if subword_tokenizer_type == 'wordpiece':
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            elif subword_tokenizer_type == 'character':
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            else:
                raise ValueError(
                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))

    def _tokenize(self, text):
        """Splits ``text`` into a list of tokens.

        The text is first split into words (unless word tokenization is
        disabled, in which case the whole text is treated as a single word),
        then every word is split into subword tokens (unless subword
        tokenization is disabled).
        """
        if self.do_word_tokenize:
            # Special tokens are passed as never_split to the word tokenizer.
            tokens = self.word_tokenizer.tokenize(text,
                                                  never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens
                            for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens
|
||||
|
||||
|
||||
class MecabTokenizer(object):
    """Runs basic tokenization with MeCab morphological parser."""

    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of token not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        # Imported here rather than at module level, so MeCab is only needed
        # when a MecabTokenizer is actually constructed.
        import MeCab
        self.mecab = MeCab.Tagger()

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text.

        Args:
            text: The text to tokenize. NFKC-normalized first when
                ``normalize_text`` is enabled.
            never_split: (`optional`) iterable of str
                Additional tokens (on top of ``self.never_split``) that are
                exempt from lower casing.

        Returns:
            A list with the surface form of every token MeCab produced.
        """
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        # Tokens exempt from lower casing. Built once as a set so that the
        # per-token membership test is cheap and any iterable is accepted.
        protected = set()
        if self.do_lower_case:
            protected.update(self.never_split)
            if never_split is not None:
                protected.update(never_split)

        tokens = []
        # The parser output is read line by line: the surface form is the text
        # before the first tab, and a bare 'EOS' line ends the sentence.
        for line in self.mecab.parse(text).split('\n'):
            if line == 'EOS':
                break
            if not line:
                continue

            # Split on the first tab only, so extra tabs in the feature
            # columns do not break parsing. A line without any tab is taken
            # as the surface form as a whole.
            token = line.partition('\t')[0]
            if self.do_lower_case and token not in protected:
                token = token.lower()

            tokens.append(token)

        return tokens
|
||||
|
||||
|
||||
class CharacterTokenizer(object):
    """Runs character tokenization."""

    def __init__(self, vocab, unk_token, normalize_text=True):
        """Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object. Only membership tests (``char in vocab``)
                are performed on it.
            **unk_token**: str
                A special symbol for out-of-vocabulary token.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.normalize_text = normalize_text

    def tokenize(self, text):
        """Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "p", "p", "l", "e"]

        Args:
            text: The text to split, e.g. a single token produced by a word
                tokenizer. Every character (whitespace included) yields
                exactly one output token.

        Returns:
            A list of characters, where each character missing from the
            vocabulary is replaced by ``unk_token``.
        """
        if self.normalize_text:
            # NFKC folds compatibility forms (e.g. full-width latin letters)
            # into their canonical equivalents before the vocabulary lookup.
            text = unicodedata.normalize('NFKC', text)

        return [char if char in self.vocab else self.unk_token for char in text]
|
||||
Reference in New Issue
Block a user