Fix group_lengths for short datasets (#12558)

This commit is contained in:
Sylvain Gugger
2021-07-08 07:23:41 -04:00
committed by GitHub
parent 0a6b9048d1
commit 6f1adc4334
10 changed files with 20 additions and 10 deletions

View File

@@ -398,6 +398,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // block_size) * block_size
+if total_length >= block_size:
+    total_length = (total_length // block_size) * block_size
 # Split by chunks of max_len.
 result = {

View File

@@ -431,6 +431,7 @@ if __name__ == "__main__":
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // max_seq_length) * max_seq_length
+if total_length >= max_seq_length:
+    total_length = (total_length // max_seq_length) * max_seq_length
 # Split by chunks of max_len.
 result = {

View File

@@ -541,6 +541,7 @@ if __name__ == "__main__":
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
+if total_length >= expanded_inputs_length:
+    total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
 # Split by chunks of max_len.
 result = {

View File

@@ -404,6 +404,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // block_size) * block_size
+if total_length >= block_size:
+    total_length = (total_length // block_size) * block_size
 # Split by chunks of max_len.
 result = {

View File

@@ -343,6 +343,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // block_size) * block_size
+if total_length >= block_size:
+    total_length = (total_length // block_size) * block_size
 # Split by chunks of max_len.
 result = {

View File

@@ -433,6 +433,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // max_seq_length) * max_seq_length
+if total_length >= max_seq_length:
+    total_length = (total_length // max_seq_length) * max_seq_length
 # Split by chunks of max_len.
 result = {

View File

@@ -387,6 +387,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // max_seq_length) * max_seq_length
+if total_length >= max_seq_length:
+    total_length = (total_length // max_seq_length) * max_seq_length
 # Split by chunks of max_len.
 result = {

View File

@@ -406,6 +406,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // max_seq_length) * max_seq_length
+if total_length >= max_seq_length:
+    total_length = (total_length // max_seq_length) * max_seq_length
 # Split by chunks of max_len.
 result = {

View File

@@ -405,6 +405,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // block_size) * block_size
+if total_length >= block_size:
+    total_length = (total_length // block_size) * block_size
 # Split by chunks of max_len.
 result = {

View File

@@ -466,6 +466,7 @@ def main():
 total_length = len(concatenated_examples[list(examples.keys())[0]])
 # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
 # customize this part to your needs.
-total_length = (total_length // max_seq_length) * max_seq_length
+if total_length >= max_seq_length:
+    total_length = (total_length // max_seq_length) * max_seq_length
 # Split by chunks of max_len.
 result = {