Add changes for uroman package to handle non-Roman characters (#32404)
* Add changes for uroman package to handle non-Roman characters * Update docs for uroman changes * Modifying error message to warning, for backward compatibility * Update instruction for user to install uroman * Update docs for uroman python version dependency and backward compatibility * Update warning message for python version compatibility with uroman * Refine docs
This commit is contained in:
@@ -93,12 +93,33 @@ from transformers import VitsTokenizer
|
|||||||
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
|
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
|
||||||
print(tokenizer.is_uroman)
|
print(tokenizer.is_uroman)
|
||||||
```
|
```
|
||||||
|
If the `is_uroman` attribute is `True`, the tokenizer will automatically apply the `uroman` package to your text inputs. This requires `uroman` to be installed; if it is not, install it with:
|
||||||
|
```
|
||||||
|
pip install --upgrade uroman
|
||||||
|
```
|
||||||
|
Note: Using `uroman` as a Python package requires Python >= `3.10`.
|
||||||
|
You can use the tokenizer as usual without any additional preprocessing steps:
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from transformers import VitsTokenizer, VitsModel, set_seed
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
If required, you should apply the uroman package to your text inputs **prior** to passing them to the `VitsTokenizer`,
|
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor")
|
||||||
since currently the tokenizer does not support performing the pre-processing itself.
|
model = VitsModel.from_pretrained("facebook/mms-tts-kor")
|
||||||
|
text = "이봐 무슨 일이야"
|
||||||
|
inputs = tokenizer(text=text, return_tensors="pt")
|
||||||
|
|
||||||
|
set_seed(555) # make deterministic
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(inputs["input_ids"])
|
||||||
|
|
||||||
|
waveform = outputs.waveform[0]
|
||||||
|
```
|
||||||
|
If you don't want to upgrade to Python >= `3.10`, you can instead use the `uroman` Perl package to pre-process the text inputs into the Roman alphabet.
|
||||||
To do this, first clone the uroman repository to your local machine and set the bash variable `UROMAN` to the local path:
|
To do this, first clone the uroman repository to your local machine and set the bash variable `UROMAN` to the local path:
|
||||||
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/isi-nlp/uroman.git
|
git clone https://github.com/isi-nlp/uroman.git
|
||||||
cd uroman
|
cd uroman
|
||||||
|
|||||||
@@ -20,12 +20,14 @@ import re
|
|||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from ...tokenization_utils import PreTrainedTokenizer
|
from ...tokenization_utils import PreTrainedTokenizer
|
||||||
from ...utils import is_phonemizer_available, logging
|
from ...utils import is_phonemizer_available, is_uroman_available, logging
|
||||||
|
|
||||||
|
|
||||||
if is_phonemizer_available():
|
if is_phonemizer_available():
|
||||||
import phonemizer
|
import phonemizer
|
||||||
|
|
||||||
|
if is_uroman_available():
|
||||||
|
import uroman as ur
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
@@ -172,11 +174,16 @@ class VitsTokenizer(PreTrainedTokenizer):
|
|||||||
filtered_text = self._preprocess_char(text)
|
filtered_text = self._preprocess_char(text)
|
||||||
|
|
||||||
if has_non_roman_characters(filtered_text) and self.is_uroman:
|
if has_non_roman_characters(filtered_text) and self.is_uroman:
|
||||||
|
if not is_uroman_available():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Text to the tokenizer contains non-Roman characters. Ensure the `uroman` Romanizer is "
|
"Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
|
||||||
"applied to the text prior to passing it to the tokenizer. See "
|
"step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` "
|
||||||
"`https://github.com/isi-nlp/uroman` for details."
|
"Note `uroman` requires python version >= 3.10"
|
||||||
|
"Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
uroman = ur.Uroman()
|
||||||
|
filtered_text = uroman.romanize_string(filtered_text)
|
||||||
|
|
||||||
if self.phonemize:
|
if self.phonemize:
|
||||||
if not is_phonemizer_available():
|
if not is_phonemizer_available():
|
||||||
|
|||||||
@@ -218,6 +218,7 @@ from .import_utils import (
|
|||||||
is_torchdynamo_compiling,
|
is_torchdynamo_compiling,
|
||||||
is_torchvision_available,
|
is_torchvision_available,
|
||||||
is_training_run_on_sagemaker,
|
is_training_run_on_sagemaker,
|
||||||
|
is_uroman_available,
|
||||||
is_vision_available,
|
is_vision_available,
|
||||||
requires_backends,
|
requires_backends,
|
||||||
torch_only_method,
|
torch_only_method,
|
||||||
|
|||||||
@@ -142,6 +142,7 @@ _quanto_available = _is_package_available("quanto")
|
|||||||
_pandas_available = _is_package_available("pandas")
|
_pandas_available = _is_package_available("pandas")
|
||||||
_peft_available = _is_package_available("peft")
|
_peft_available = _is_package_available("peft")
|
||||||
_phonemizer_available = _is_package_available("phonemizer")
|
_phonemizer_available = _is_package_available("phonemizer")
|
||||||
|
_uroman_available = _is_package_available("uroman")
|
||||||
_psutil_available = _is_package_available("psutil")
|
_psutil_available = _is_package_available("psutil")
|
||||||
_py3nvml_available = _is_package_available("py3nvml")
|
_py3nvml_available = _is_package_available("py3nvml")
|
||||||
_pyctcdecode_available = _is_package_available("pyctcdecode")
|
_pyctcdecode_available = _is_package_available("pyctcdecode")
|
||||||
@@ -1107,6 +1108,10 @@ def is_phonemizer_available():
|
|||||||
return _phonemizer_available
|
return _phonemizer_available
|
||||||
|
|
||||||
|
|
||||||
|
def is_uroman_available():
|
||||||
|
return _uroman_available
|
||||||
|
|
||||||
|
|
||||||
def torch_only_method(fn):
|
def torch_only_method(fn):
|
||||||
def wrapper(*args, **kwargs):
|
def wrapper(*args, **kwargs):
|
||||||
if not _torch_available:
|
if not _torch_available:
|
||||||
@@ -1383,6 +1388,11 @@ PHONEMIZER_IMPORT_ERROR = """
|
|||||||
{0} requires the phonemizer library but it was not found in your environment. You can install it with pip:
|
{0} requires the phonemizer library but it was not found in your environment. You can install it with pip:
|
||||||
`pip install phonemizer`. Please note that you may need to restart your runtime after installation.
|
`pip install phonemizer`. Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
# docstyle-ignore
|
||||||
|
UROMAN_IMPORT_ERROR = """
|
||||||
|
{0} requires the uroman library but it was not found in your environment. You can install it with pip:
|
||||||
|
`pip install uroman`. Please note that you may need to restart your runtime after installation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
@@ -1523,6 +1533,7 @@ BACKENDS_MAPPING = OrderedDict(
|
|||||||
("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)),
|
("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)),
|
||||||
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
|
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
|
||||||
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
|
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
|
||||||
|
("uroman", (is_uroman_available, UROMAN_IMPORT_ERROR)),
|
||||||
("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)),
|
("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)),
|
||||||
("levenshtein", (is_levenshtein_available, LEVENSHTEIN_IMPORT_ERROR)),
|
("levenshtein", (is_levenshtein_available, LEVENSHTEIN_IMPORT_ERROR)),
|
||||||
("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
|
("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
|
||||||
|
|||||||
Reference in New Issue
Block a user