[RoFormer] Fix some issues (#12397)

* add RoFormerTokenizerFast into AutoTokenizer

* fix typo in roformer docs

* make onnx export happy

* update RoFormerConfig embedding_size

* use jieba not rjieba

* fix 12244 and make test_alignement passed

* update ARCHIVE_MAP

* make style & quality & fixup

* update

* make style & quality & fixup

* make style quality fixup

* update

* suggestion from LysandreJik

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* make style

* use rjieba

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
yujun
2021-07-06 15:31:57 +08:00
committed by GitHub
parent f5b0c1ecf0
commit 626a0a0147
11 changed files with 97 additions and 53 deletions

View File

@@ -13,29 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import unittest
from transformers import RoFormerTokenizer, RoFormerTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.testing_utils import require_rjieba, require_tokenizers
from .test_tokenization_common import TokenizerTesterMixin
def is_rjieba_available():
return importlib.util.find_spec("rjieba") is not None
def require_rjieba(test_case):
"""
Decorator marking a test that requires Jieba. These tests are skipped when Jieba isn't installed.
"""
if not is_rjieba_available():
return unittest.skip("test requires rjieba")(test_case)
else:
return test_case
@require_rjieba
@require_tokenizers
class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -79,6 +64,10 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens)
# due to custom pre_tokenize , char_to_token may be error
def test_alignement_methods(self):
# can't train new_tokenizer via Tokenizers lib
def test_training_new_tokenizer(self):
pass
# can't train new_tokenizer via Tokenizers lib
def test_training_new_tokenizer_with_special_tokens_change(self):
pass