Wav2Vec2 phoneme CTC tokenizer optimisation (#16817)
* Solved href rendering issue in heading: Markdown references in headings such as '####' don't render well. Replaced them with <h4>...<a></a></h4> banners. * PhonemeTokenizer optimization using the phonemizer lib: the backend should only be initialized once; otherwise it is reloaded on every call. Added an `init_backend` function that initializes a `backend` attribute. `phonemize` re-uses `self.backend`. Should give ~10 times faster phonemization. * Formatted file with make style * Documentation suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update /tokenization_wav2vec2_phoneme.py based on PR suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update CONTRIBUTING.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https://
|
|||||||
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
|
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
|
||||||
for more information.
|
for more information.
|
||||||
|
|
||||||
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md)
|
**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
|
||||||
|
|
||||||
|
|
||||||
### Develop on Windows
|
### Develop on Windows
|
||||||
|
|
||||||
|
|||||||
@@ -158,6 +158,9 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
|
|||||||
self.phonemizer_lang = phonemizer_lang
|
self.phonemizer_lang = phonemizer_lang
|
||||||
self.phonemizer_backend = phonemizer_backend
|
self.phonemizer_backend = phonemizer_backend
|
||||||
|
|
||||||
|
if do_phonemize:
|
||||||
|
self.init_backend(self.phonemizer_lang)
|
||||||
|
|
||||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||||
self.encoder = json.load(vocab_handle)
|
self.encoder = json.load(vocab_handle)
|
||||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||||
@@ -169,6 +172,18 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
|
|||||||
def get_vocab(self) -> Dict:
|
def get_vocab(self) -> Dict:
|
||||||
return dict(self.encoder, **self.added_tokens_encoder)
|
return dict(self.encoder, **self.added_tokens_encoder)
|
||||||
|
|
||||||
|
def init_backend(self, phonemizer_lang: str):
|
||||||
|
"""
|
||||||
|
Initializes the backend.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
phonemizer_lang (`str`): The language to be used.
|
||||||
|
"""
|
||||||
|
requires_backends(self, "phonemizer")
|
||||||
|
from phonemizer.backend import BACKENDS
|
||||||
|
|
||||||
|
self.backend = BACKENDS[self.phonemizer_backend](phonemizer_lang, language_switch="remove-flags")
|
||||||
|
|
||||||
def prepare_for_tokenization(
|
def prepare_for_tokenization(
|
||||||
self,
|
self,
|
||||||
text: str,
|
text: str,
|
||||||
@@ -209,6 +224,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
|
|||||||
# set the correct phonemizer language
|
# set the correct phonemizer language
|
||||||
if phonemizer_lang is not None:
|
if phonemizer_lang is not None:
|
||||||
self.phonemizer_lang = phonemizer_lang
|
self.phonemizer_lang = phonemizer_lang
|
||||||
|
self.init_backend(phonemizer_lang)
|
||||||
|
|
||||||
return (text, {})
|
return (text, {})
|
||||||
|
|
||||||
@@ -234,23 +250,20 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
|
|||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
|
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
|
||||||
requires_backends(self, "phonemizer")
|
|
||||||
|
|
||||||
from phonemizer import phonemize
|
|
||||||
from phonemizer.separator import Separator
|
from phonemizer.separator import Separator
|
||||||
|
|
||||||
word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else ""
|
word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else ""
|
||||||
phonemizer_lang = phonemizer_lang if phonemizer_lang is not None else self.phonemizer_lang
|
if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang:
|
||||||
|
self.init_backend(phonemizer_lang)
|
||||||
|
else:
|
||||||
|
phonemizer_lang = self.phonemizer_lang
|
||||||
|
|
||||||
separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="")
|
separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="")
|
||||||
phonemes = phonemize(
|
phonemes = self.backend.phonemize(
|
||||||
text,
|
[text],
|
||||||
language=phonemizer_lang,
|
|
||||||
backend=self.phonemizer_backend,
|
|
||||||
separator=separator,
|
separator=separator,
|
||||||
language_switch="remove-flags",
|
|
||||||
)
|
)
|
||||||
phonemes = phonemes.strip()
|
phonemes = phonemes[0].strip()
|
||||||
|
|
||||||
return phonemes
|
return phonemes
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user