From 6f1adc43344a4ebe6fb1ecc018df9d6c092370cf Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 8 Jul 2021 07:23:41 -0400 Subject: [PATCH] Fix group_lengths for short datasets (#12558) --- examples/flax/language-modeling/run_clm_flax.py | 3 ++- examples/flax/language-modeling/run_mlm_flax.py | 3 ++- examples/flax/language-modeling/run_t5_mlm_flax.py | 3 ++- examples/pytorch/language-modeling/run_clm.py | 3 ++- examples/pytorch/language-modeling/run_clm_no_trainer.py | 3 ++- examples/pytorch/language-modeling/run_mlm.py | 3 ++- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 3 ++- examples/pytorch/language-modeling/run_plm.py | 3 ++- examples/tensorflow/language-modeling/run_clm.py | 3 ++- examples/tensorflow/language-modeling/run_mlm.py | 3 ++- 10 files changed, 20 insertions(+), 10 deletions(-) diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py index d20de1b1d6..1367ad2db9 100755 --- a/examples/flax/language-modeling/run_clm_flax.py +++ b/examples/flax/language-modeling/run_clm_flax.py @@ -398,7 +398,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // block_size) * block_size + if total_length >= block_size: + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index 34fb948000..56cc35d969 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -431,7 +431,8 @@ if __name__ == "__main__": total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // max_seq_length) * max_seq_length + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index 0cf6f3d632..f04f586001 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -541,7 +541,8 @@ if __name__ == "__main__": total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // expanded_inputs_length) * expanded_inputs_length + if total_length >= expanded_inputs_length: + total_length = (total_length // expanded_inputs_length) * expanded_inputs_length # Split by chunks of max_len. result = { k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)] diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 13b5e660f4..8f3514b4d6 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -404,7 +404,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // block_size) * block_size + if total_length >= block_size: + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index c22021aaa1..0c85e45238 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -343,7 +343,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // block_size) * block_size + if total_length >= block_size: + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index b2acdb999e..5355fe87a0 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -433,7 +433,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // max_seq_length) * max_seq_length + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 4deb746025..6d911e93b1 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -387,7 +387,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // max_seq_length) * max_seq_length + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index fbc98fb01f..09c4e28569 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -406,7 +406,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // max_seq_length) * max_seq_length + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 67f959d2af..92d74cfd6f 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -405,7 +405,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // block_size) * block_size + if total_length >= block_size: + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 008ca6d69d..d862c4423e 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -466,7 +466,8 @@ def main(): total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. - total_length = (total_length // max_seq_length) * max_seq_length + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]