Fix sentinel token IDs in data collator for Flax T5 pretraining script (#14477)
This commit is contained in:
@@ -291,7 +291,7 @@ class FlaxDataCollatorForT5MLM:
  start_indices[:, 0] = mask_indices[:, 0]

  sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
- sentinel_ids = np.where(sentinel_ids != 0, (sentinel_ids + self.tokenizer.vocab_size - 1), 0)
+ sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
  sentinel_ids -= mask_indices - start_indices

  return sentinel_ids
Reference in New Issue
Block a user