From 195bfd118a3a0e80c647161ca2e2c9af7db9a225 Mon Sep 17 00:00:00 2001 From: Philip May Date: Fri, 23 Apr 2021 23:52:31 +0200 Subject: [PATCH] Enable option for subword regularization in `XLMRobertaTokenizer` (#11149) * enable subword regularization. * fix tokenizer storage * fix docstring formatting * Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py Co-authored-by: Stefan Schweter * fix docstring formatting * add test for subword regularization tokenizer * improve comments of test * add sp_model_kwargs * reformat docstring to match the style * add some more documentation * Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * improve docstring * empty commit to trigger CI * Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * fix docstring formatting for sphinx Co-authored-by: Stefan Schweter Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../xlm_roberta/tokenization_xlm_roberta.py | 22 +++++++++++++++-- tests/test_tokenization_xlm_roberta.py | 24 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 8ecec6dffe..877bfaf1d1 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -94,6 +94,20 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): Additional special tokens used by the tokenizer. 
+ sp_model_kwargs (:obj:`dict`, `optional`, defaults to :obj:`None`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -115,11 +129,14 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): unk_token="<unk>", pad_token="<pad>", mask_token="<mask>", + sp_model_kwargs=None, **kwargs ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -128,10 +145,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, + sp_model_kwargs=sp_model_kwargs, **kwargs, ) - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -249,7 +267,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): return vocab def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. 
""" diff --git a/tests/test_tokenization_xlm_roberta.py b/tests/test_tokenization_xlm_roberta.py index 48a40031f5..8031ebc405 100644 --- a/tests/test_tokenization_xlm_roberta.py +++ b/tests/test_tokenization_xlm_roberta.py @@ -14,6 +14,7 @@ # limitations under the License. +import itertools import os import unittest @@ -118,6 +119,29 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ], ) + def test_subword_regularization_tokenizer(self): + # Subword regularization is only available for the slow tokenizer. + tokenizer = XLMRobertaTokenizer( + SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} + ) + + # Subword regularization augments training data with subword sampling. + # This has a random component. We test if the tokenizer generates different + # results when subword regularization is enabled. + tokens_list = [] + for _ in range(5): + tokens_list.append(tokenizer.tokenize("This is a test for subword regularization.")) + + # the list of different pairs of tokens_list + combinations = itertools.combinations(tokens_list, 2) + + all_equal = True + for combination in combinations: + if combination[0] != combination[1]: + all_equal = False + + self.assertFalse(all_equal) + @cached_property def big_tokenizer(self): return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")