Fix group_lengths for short datasets (#12558)
This commit is contained in:
@@ -398,7 +398,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // block_size) * block_size
|
if total_length >= block_size:
|
||||||
|
    total_length = (total_length // block_size) * block_size
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
|||||||
@@ -431,7 +431,8 @@ if __name__ == "__main__":
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
if total_length >= max_seq_length:
|
||||||
|
    total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
|||||||
@@ -541,7 +541,8 @@ if __name__ == "__main__":
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
|
if total_length >= expanded_inputs_length:
|
||||||
|
    total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
|
k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
|
||||||
|
|||||||
@@ -404,7 +404,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // block_size) * block_size
|
if total_length >= block_size:
|
||||||
|
    total_length = (total_length // block_size) * block_size
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
|||||||
@@ -343,7 +343,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // block_size) * block_size
|
if total_length >= block_size:
|
||||||
|
    total_length = (total_length // block_size) * block_size
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
|||||||
@@ -433,7 +433,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
if total_length >= max_seq_length:
|
||||||
|
    total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
|||||||
@@ -387,7 +387,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
if total_length >= max_seq_length:
|
||||||
|
    total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
|||||||
@@ -406,7 +406,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
if total_length >= max_seq_length:
|
||||||
|
    total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
|||||||
@@ -405,7 +405,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // block_size) * block_size
|
if total_length >= block_size:
|
||||||
|
    total_length = (total_length // block_size) * block_size
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
|||||||
@@ -466,7 +466,8 @@ def main():
|
|||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||||
# customize this part to your needs.
|
# customize this part to your needs.
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
if total_length >= max_seq_length:
|
||||||
|
    total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
|||||||
Reference in New Issue
Block a user