From 6de4ee61a03d3f5f8184195c92e58b7f5b8ba642 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 19 Apr 2022 13:39:04 +0200 Subject: [PATCH] Wav2 vec2 phoneme ctc tokenizer optimisation (#16817) * Solved href rendering issue in heading Markdown references in headings such as '####' don't render well. Replaced it with

... banners. * PhonemeTokenizer optimization using phonemizer lib The backend should only be initialized once, otherwise it is reloaded. Added `init_backend` function, initializes a backend attribute. Phonemize re-uses self.backend. Should give ~10 times faster phonemization. * formatted file with make style * Documentation suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update /tokenization_wav2vec2_phoneme.py based on PR suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update CONTRIBUTING.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- CONTRIBUTING.md | 3 +- .../tokenization_wav2vec2_phoneme.py | 33 +++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c0af3fbaa6..e74510948a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https:// Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for more information. 
-#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md) - +**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).** ### Develop on Windows diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index b37f902c34..6bd355645e 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -158,6 +158,9 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): self.phonemizer_lang = phonemizer_lang self.phonemizer_backend = phonemizer_backend + if do_phonemize: + self.init_backend(self.phonemizer_lang) + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -169,6 +172,18 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): def get_vocab(self) -> Dict: return dict(self.encoder, **self.added_tokens_encoder) + def init_backend(self, phonemizer_lang: str): + """ + Initializes the backend. + + Args: + phonemizer_lang (`str`): The language to be used. 
+ """ + requires_backends(self, "phonemizer") + from phonemizer.backend import BACKENDS + + self.backend = BACKENDS[self.phonemizer_backend](phonemizer_lang, language_switch="remove-flags") + def prepare_for_tokenization( self, text: str, @@ -209,6 +224,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): # set the correct phonemizer language if phonemizer_lang is not None: self.phonemizer_lang = phonemizer_lang + self.init_backend(phonemizer_lang) return (text, {}) @@ -234,23 +250,20 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): return tokens def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str: - requires_backends(self, "phonemizer") - - from phonemizer import phonemize from phonemizer.separator import Separator word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else "" - phonemizer_lang = phonemizer_lang if phonemizer_lang is not None else self.phonemizer_lang + if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang: + self.init_backend(phonemizer_lang) + else: + phonemizer_lang = self.phonemizer_lang separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="") - phonemes = phonemize( - text, - language=phonemizer_lang, - backend=self.phonemizer_backend, + phonemes = self.backend.phonemize( + [text], separator=separator, - language_switch="remove-flags", ) - phonemes = phonemes.strip() + phonemes = phonemes[0].strip() return phonemes