[MarianTokenizer] Switch to sacremoses for punc normalization (#5092)

This commit is contained in:
Sam Shleifer
2020-06-17 16:31:05 -04:00
committed by GitHub
parent 049e14f0e3
commit 90c833870c

View File

@@ -82,11 +82,11 @@ class MarianTokenizer(PreTrainedTokenizer):
     def _setup_normalizer(self):
         try:
-            from mosestokenizer import MosesPunctuationNormalizer
+            from sacremoses import MosesPunctNormalizer
-            self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang)
-        except ImportError:
-            warnings.warn("Recommended: pip install mosestokenizer")
+            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
+        except (ImportError, FileNotFoundError):
+            warnings.warn("Recommended: pip install sacremoses.")
             self.punc_normalizer = lambda x: x
     def normalize(self, x: str) -> str: