Improve bert-japanese tokenizer handling (#8659)
* Make ci fail
* Try to make tests actually run?
* CI finally failing?
* Fix CI
* Revert "Fix CI"
  This reverts commit ca7923be7334d4e571b023478ebdd6b33dfd0ebb.
* Ooops wrong one
* one more try
* Ok ok let's move this elsewhere
* Alternative to globals() (#8667)
* Alternative to globals()
* Error is raised later so return None
* Sentencepiece not installed make some tokenizers None
* Apply Lysandre wisdom
* Slightly clearer comment? cc @sgugger

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -18,6 +18,7 @@ import os
|
||||
import pickle
|
||||
import unittest
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.models.bert_japanese.tokenization_bert_japanese import (
|
||||
VOCAB_FILES_NAMES,
|
||||
BertJapaneseTokenizer,
|
||||
@@ -267,3 +268,11 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
|
||||
# 2 is for "[CLS]", 3 is for "[SEP]"
|
||||
assert encoded_sentence == [2] + text + [3]
|
||||
assert encoded_pair == [2] + text + [3] + text_2 + [3]
|
||||
|
||||
|
||||
@custom_tokenizers
class AutoTokenizerCustomTest(unittest.TestCase):
    """Checks which tokenizer class AutoTokenizer yields for a bert-japanese checkpoint."""

    def test_tokenizer_bert_japanese(self):
        """Load the checkpoint through AutoTokenizer and verify the resulting type."""
        checkpoint_id = "cl-tohoku/bert-base-japanese"
        loaded = AutoTokenizer.from_pretrained(checkpoint_id)
        # The auto class must hand back the dedicated Japanese tokenizer.
        self.assertIsInstance(loaded, BertJapaneseTokenizer)
|
||||
|
||||
Reference in New Issue
Block a user