Hotfix chunk_length_s instead of _ms. (#15029)

* Hotfix: use `chunk_length_s` instead of `chunk_length_ms`.

* Adding fix of `pad_token`, which should be the last/previous token for proper CTC decoding.

* Fixing ChunkPipeline unwrapping.

* Adding a PackIterator specific test.
This commit is contained in:
Nicolas Patry
2022-01-04 14:07:44 +01:00
committed by GitHub
parent 21aecc0971
commit 19d37c2dd3
4 changed files with 191 additions and 50 deletions

View File

@@ -584,3 +584,14 @@ class PipelineUtilsTest(unittest.TestCase):
outputs = [item for item in dataset]
self.assertEqual(outputs, [[{"id": 2}, {"id": 3}], [{"id": 4}, {"id": 5}]])
# is_last is False for the whole first batch, so a single pack spans both batches
dummy_dataset = [{"id": [0, 1, 2], "is_last": [False, False, False]}, {"id": [3], "is_last": [True]}]
def add(number, extra=0):
    """Return a copy of the batch with `extra` added to every id.

    Args:
        number: Mapping with an ``"id"`` list of numbers and an ``"is_last"``
            entry.
        extra: Amount added to each element of ``number["id"]``.

    Returns:
        A new dict whose ``"id"`` holds the shifted values and whose
        ``"is_last"`` is the same object found in ``number``.
    """
    # Only the ids are transformed; the is_last flags are passed through as-is.
    return {"id": [i + extra for i in number["id"]], "is_last": number["is_last"]}
dataset = PipelinePackIterator(dummy_dataset, add, {"extra": 2}, loader_batch_size=3)
outputs = [item for item in dataset]
self.assertEqual(outputs, [[{"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}]])