tokenizer train from iterator without pre_tokenizers (#35396)
* fix if else issues
* add a test
* fix the test
* style
This commit is contained in:
@@ -813,17 +813,17 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
|||||||
kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
|
kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
|
||||||
if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
|
if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
|
||||||
kwargs["unk_token"] = unk_token
|
kwargs["unk_token"] = unk_token
|
||||||
if (
|
if tokenizer_json["pre_tokenizer"] is not None:
|
||||||
tokenizer_json["pre_tokenizer"] is not None
|
if (
|
||||||
and tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
|
tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
|
||||||
or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
|
or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
|
||||||
and "pretokenizers" in tokenizer_json["pre_tokenizer"]
|
and "pretokenizers" in tokenizer_json["pre_tokenizer"]
|
||||||
and any(
|
and any(
|
||||||
pretokenizer["type"] == "ByteLevel"
|
pretokenizer["type"] == "ByteLevel"
|
||||||
for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
|
for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
|
kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
|
||||||
|
|
||||||
trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
|
trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
|
||||||
trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
|
trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ from transformers.testing_utils import (
|
|||||||
|
|
||||||
|
|
||||||
if is_tokenizers_available():
|
if is_tokenizers_available():
|
||||||
|
import tokenizers
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
from tokenizers.models import WordPiece
|
from tokenizers.models import WordPiece
|
||||||
|
|
||||||
@@ -428,3 +429,21 @@ class TokenizerUtilsTest(unittest.TestCase):
|
|||||||
# Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
|
# Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
|
||||||
# was already imported.
|
# was already imported.
|
||||||
import_protobuf()
|
import_protobuf()
|
||||||
|
|
||||||
|
def test_training_new_tokenizer_edge_cases(self):
    """Check that `train_new_from_iterator` runs when pipeline components are `None`.

    The backend tokenizer's pre-tokenizer, normalizer and post-processor are set
    to `None` one after another on the same object (so the effect accumulates),
    and a new tokenizer is trained after each step. There are no explicit
    assertions: the test passes as long as none of the calls raises.
    """
    backend = Tokenizer(tokenizers.models.BPE(vocab={"a": 1, "b": 2, "ab": 3}, merges=[("a", "b")]))

    # Order matters only in that each round keeps the previous components cleared.
    for component in ("pre_tokenizer", "normalizer", "post_processor"):
        setattr(backend, component, None)
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend)
        # A generator can only be consumed once, so build a new one per round.
        corpus = ("a" for _ in range(1000))
        fast_tokenizer.train_new_from_iterator(text_iterator=corpus, length=1000, vocab_size=50)
|
||||||
|
|||||||
Reference in New Issue
Block a user