Enable option for subword regularization in XLMRobertaTokenizer (#11149)

* enable subword regularization.

* fix tokenizer storage

* fix docstring formatting

* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

Co-authored-by: Stefan Schweter <stefan@schweter.it>

* fix docstring formatting

* add test for subword regularization tokenizer

* improve comments of test

* add sp_model_kwargs

* reformat docstring to match the style

* add some more documentation

* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* improve docstring

* empty commit to trigger CI

* Update src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* fix docstring formatting for sphinx

Co-authored-by: Stefan Schweter <stefan@schweter.it>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Philip May
2021-04-23 23:52:31 +02:00
committed by GitHub
parent 1ef152eb48
commit 195bfd118a
2 changed files with 44 additions and 2 deletions

View File

@@ -14,6 +14,7 @@
# limitations under the License.
import itertools
import os
import unittest
@@ -118,6 +119,29 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
],
)
def test_subword_regularization_tokenizer(self):
# Subword regularization is only available for the slow tokenizer.
tokenizer = XLMRobertaTokenizer(
SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
)
# Subword regularization augments training data with subword sampling.
# This has a random component. We test if the tokenizer generates different
# results when subword regularization is enabled.
tokens_list = []
for _ in range(5):
tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))
# the list of different pairs of tokens_list
combinations = itertools.combinations(tokens_list, 2)
all_equal = True
for combination in combinations:
if combination[0] != combination[1]:
all_equal = False
self.assertFalse(all_equal)
@cached_property
def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")