[MarianTokenizer] Switch to sacremoses for punc normalization (#5092)
This commit is contained in:
@@ -82,11 +82,11 @@ class MarianTokenizer(PreTrainedTokenizer):
 
     def _setup_normalizer(self):
         try:
-            from mosestokenizer import MosesPunctuationNormalizer
+            from sacremoses import MosesPunctNormalizer
 
-            self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang)
+            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
-        except ImportError:
+        except (ImportError, FileNotFoundError):
-            warnings.warn("Recommended: pip install mosestokenizer")
+            warnings.warn("Recommended: pip install sacremoses.")
             self.punc_normalizer = lambda x: x
 
     def normalize(self, x: str) -> str:
Reference in New Issue
Block a user