Fixing the timestamps with chunking. (#15843)
* Fixing the timestamps with chunking.
* The changes modified (and fixed) the striding tests.
* Adding a tokenizer test.
* Update src/transformers/pipelines/automatic_speech_recognition.py
  Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
* Defense -> comment.
* Update src/transformers/models/wav2vec2/tokenization_wav2vec2.py
  Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -540,6 +540,42 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# last E is at 6th position of first word, first L is at last (15th) position of second word
|
||||
self.assertListEqual(self.get_from_offsets(outputs["word_offsets"], "end_offset"), [6, 15])
|
||||
|
||||
def test_word_offsets_from_char_offsets(self):
    """Check that `_get_word_offsets` merges per-character offsets into word offsets."""

    def one_frame_per_char(text):
        # Each character of `text` spans exactly one frame: [position, position + 1).
        return [
            {"char": symbol, "start_offset": position, "end_offset": position + 1}
            for position, symbol in enumerate(text)
        ]

    tokenizer = self.get_tokenizer()
    delimiter_char = tokenizer.replace_word_delimiter_char

    # Two words separated by a single space.
    expected_simple = [
        {"word": "HI", "start_offset": 0, "end_offset": 2},
        {"word": "LI", "start_offset": 3, "end_offset": 5},
    ]
    word_offsets = tokenizer._get_word_offsets(one_frame_per_char("HI LI"), delimiter_char)
    self.assertEqual(word_offsets, expected_simple)

    # Double spaces don't get counted: leading, repeated and trailing spaces
    # must not produce extra words, and the word boundaries stay on the letters.
    expected_padded = [
        {"word": "HI", "start_offset": 1, "end_offset": 3},
        {"word": "LII", "start_offset": 5, "end_offset": 8},
    ]
    word_offsets = tokenizer._get_word_offsets(one_frame_per_char(" HI  LII  "), delimiter_char)
    self.assertEqual(word_offsets, expected_padded)
|
||||
|
||||
def test_offsets_batch(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user