From 90c833870c78bb3d5d807a9a3e6a40d24bf2302b Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Wed, 17 Jun 2020 16:31:05 -0400
Subject: [PATCH] [MarianTokenizer] Switch to sacremoses for punc normalization (#5092)

---
 src/transformers/tokenization_marian.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/tokenization_marian.py b/src/transformers/tokenization_marian.py
index 8a95d1fbbd..4d307cf978 100644
--- a/src/transformers/tokenization_marian.py
+++ b/src/transformers/tokenization_marian.py
@@ -82,11 +82,11 @@ class MarianTokenizer(PreTrainedTokenizer):
 
     def _setup_normalizer(self):
         try:
-            from mosestokenizer import MosesPunctuationNormalizer
+            from sacremoses import MosesPunctNormalizer
 
-            self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang)
-        except ImportError:
-            warnings.warn("Recommended: pip install mosestokenizer")
+            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
+        except (ImportError, FileNotFoundError):
+            warnings.warn("Recommended: pip install sacremoses.")
             self.punc_normalizer = lambda x: x
 
     def normalize(self, x: str) -> str: