From e43e11260ff3c0a1b3cb0f4f39782d71a51c0191 Mon Sep 17 00:00:00 2001 From: Bhavitvya Malik Date: Fri, 18 Jun 2021 01:07:31 +0530 Subject: [PATCH] update desc for map in all examples (#12226) * update desc for map in all examples * added plm * suggestions --- examples/pytorch/language-modeling/requirements.txt | 2 +- examples/pytorch/language-modeling/run_clm.py | 4 ++++ examples/pytorch/language-modeling/run_clm_no_trainer.py | 6 ++++++ examples/pytorch/language-modeling/run_mlm.py | 5 +++++ examples/pytorch/language-modeling/run_mlm_no_trainer.py | 5 +++++ examples/pytorch/language-modeling/run_plm.py | 5 +++++ examples/pytorch/question-answering/requirements.txt | 2 +- examples/pytorch/question-answering/run_qa.py | 5 +++++ .../pytorch/question-answering/run_qa_beam_search.py | 5 +++++ .../question-answering/run_qa_beam_search_no_trainer.py | 5 +++++ examples/pytorch/question-answering/run_qa_no_trainer.py | 5 +++++ examples/pytorch/summarization/requirements.txt | 2 +- examples/pytorch/summarization/run_summarization.py | 5 +++++ .../summarization/run_summarization_no_trainer.py | 9 ++++++++- examples/pytorch/token-classification/requirements.txt | 2 +- examples/pytorch/token-classification/run_ner.py | 5 +++++ .../pytorch/token-classification/run_ner_no_trainer.py | 8 +++++++- examples/pytorch/translation/requirements.txt | 2 +- examples/pytorch/translation/run_translation.py | 5 +++++ .../pytorch/translation/run_translation_no_trainer.py | 4 ++++ 20 files changed, 84 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/language-modeling/requirements.txt b/examples/pytorch/language-modeling/requirements.txt index 58d9fb8a8c..4e41336c64 100644 --- a/examples/pytorch/language-modeling/requirements.txt +++ b/examples/pytorch/language-modeling/requirements.txt @@ -1,4 +1,4 @@ torch >= 1.3 -datasets >= 1.1.3 +datasets >= 1.8.0 sentencepiece != 0.1.92 protobuf diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 92ea20bd88..6ec82b593d 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -46,10 +46,12 @@ from transformers import ( from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) @@ -355,6 +357,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", ) if data_args.block_size is None: @@ -401,6 +404,7 @@ def main(): batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", ) if training_args.do_train: diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 4005e7883c..906aa1af55 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -48,9 +48,13 @@ from transformers import ( get_scheduler, set_seed, ) +from transformers.utils.versions import require_version logger = logging.getLogger(__name__) + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -300,6 +304,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset", ) if args.block_size is None: @@ -346,6 +351,7 @@ def main(): batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", ) train_dataset = lm_datasets["train"] diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 0e94cb290e..2163ecc4b8 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -45,10 +45,12 @@ from transformers import ( ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) @@ -380,6 +382,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. @@ -394,6 +397,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of @@ -424,6 +428,7 @@ def main(): batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", ) if training_args.do_train: diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 27e61056df..e280b375f4 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -48,9 +48,11 @@ from transformers import ( get_scheduler, set_seed, ) +from transformers.utils.versions import require_version logger = logging.getLogger(__name__) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -346,6 +348,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. @@ -360,6 +363,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on every text in dataset", ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of @@ -390,6 +394,7 @@ def main(): batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", ) train_dataset = tokenized_datasets["train"] diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index b0439e7f2d..537da55abb 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -41,10 +41,12 @@ from transformers import ( ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) @@ -358,6 +360,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. @@ -370,6 +373,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of @@ -400,6 +404,7 @@ def main(): batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", ) if training_args.do_train: diff --git a/examples/pytorch/question-answering/requirements.txt b/examples/pytorch/question-answering/requirements.txt index ca9b0641cb..ca8e8e12ce 100644 --- a/examples/pytorch/question-answering/requirements.txt +++ b/examples/pytorch/question-answering/requirements.txt @@ -1,2 +1,2 @@ -datasets >= 1.4.0 +datasets >= 1.8.0 torch >= 1.3.0 diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 0f3b4eedd8..da762429a0 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -42,11 +42,13 @@ from transformers import ( ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version from utils_qa import postprocess_qa_predictions # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") logger = logging.getLogger(__name__) @@ -417,6 +419,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples @@ -478,6 +481,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again @@ -497,6 +501,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 10f1bce8b3..a81c3ad23a 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -41,11 +41,13 @@ from transformers import ( ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version from utils_qa import postprocess_qa_predictions_with_beam_search # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") logger = logging.getLogger(__name__) @@ -429,6 +431,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Select samples from dataset again since Feature Creation might increase number of features @@ -514,6 +517,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # Selecting Samples from Dataset again since Feature Creation might increase samples size @@ -533,6 +537,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 6b9c6b156b..ea0f072d28 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -46,11 +46,13 @@ from transformers import ( set_seed, ) from transformers.utils import check_min_version +from transformers.utils.versions import require_version from utils_qa import postprocess_qa_predictions_with_beam_search # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") logger = logging.getLogger(__name__) @@ -419,6 +421,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples @@ -503,6 +506,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: @@ -523,6 +527,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 93af00d39a..e3b14dd8cf 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -48,11 +48,13 @@ from transformers import ( set_seed, ) from transformers.utils import check_min_version +from transformers.utils.versions import require_version from utils_qa import postprocess_qa_predictions # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") logger = logging.getLogger(__name__) # You should update this to your particular problem to have better documentation of `model_type` @@ -448,6 +450,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples @@ -508,6 +511,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: @@ -528,6 +532,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again diff --git a/examples/pytorch/summarization/requirements.txt b/examples/pytorch/summarization/requirements.txt index a721194361..f2f908b38b 100644 --- a/examples/pytorch/summarization/requirements.txt +++ b/examples/pytorch/summarization/requirements.txt @@ -1,4 +1,4 @@ -datasets >= 1.1.3 +datasets >= 1.8.0 sentencepiece != 0.1.92 protobuf rouge-score diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index a8335f6646..8fbafe7b54 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -43,10 +43,12 @@ from transformers import ( from transformers.file_utils import is_offline_mode from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") logger = logging.getLogger(__name__) @@ -433,6 +435,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if training_args.do_eval: @@ -448,6 +451,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if training_args.do_predict: @@ -463,6 +467,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) # Data collator diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index c25c77d756..9f4b8f7999 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -48,9 +48,12 @@ from transformers import ( set_seed, ) from transformers.file_utils import is_offline_mode +from transformers.utils.versions import require_version logger = logging.getLogger(__name__) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") + # You should update this to your particular problem to have better documentation of `model_type` MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -419,7 +422,11 @@ def main(): return model_inputs processed_datasets = raw_datasets.map( - preprocess_function, batched=True, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] diff --git a/examples/pytorch/token-classification/requirements.txt b/examples/pytorch/token-classification/requirements.txt index 842b66c86c..2b4bee1f85 100644 --- a/examples/pytorch/token-classification/requirements.txt +++ b/examples/pytorch/token-classification/requirements.txt @@ -1,3 +1,3 @@ seqeval -datasets >= 1.1.3 +datasets >= 1.8.0 torch >= 1.3 diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 065cd7528a..73bb03c7e0 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -42,10 +42,12 @@ from transformers import ( ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") logger = logging.getLogger(__name__) @@ -388,6 +390,7 @@ def main(): batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if training_args.do_eval: @@ -401,6 +404,7 @@ def main(): batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if training_args.do_predict: @@ -414,6 +418,7 @@ def main(): batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) # Data collator diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 958d3d842a..26990f68c2 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -45,9 +45,12 @@ from transformers import ( get_scheduler, set_seed, ) +from transformers.utils.versions import require_version logger = logging.getLogger(__name__) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") + # You should update this to your particular problem to have better documentation of `model_type` MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -381,7 +384,10 @@ def main(): return tokenized_inputs processed_raw_datasets = raw_datasets.map( - tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names + tokenize_and_align_labels, + batched=True, + remove_columns=raw_datasets["train"].column_names, + desc="Running tokenizer on dataset", ) train_dataset = processed_raw_datasets["train"] diff --git a/examples/pytorch/translation/requirements.txt b/examples/pytorch/translation/requirements.txt index 6572e995a5..3ca965b581 100644 --- a/examples/pytorch/translation/requirements.txt +++ b/examples/pytorch/translation/requirements.txt @@ -1,4 +1,4 @@ -datasets >= 1.1.3 +datasets >= 1.8.0 sentencepiece != 0.1.92 protobuf sacrebleu >= 1.4.12 diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index cfacc95867..8a5a6f531f 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -46,10 +46,12 @@ from transformers import ( ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version +from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.8.0.dev0") +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") logger = logging.getLogger(__name__) @@ -427,6 +429,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", ) if training_args.do_eval: @@ -442,6 +445,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) if training_args.do_predict: @@ -457,6 +461,7 @@ def main(): num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) # Data collator diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 4350d59b9a..e6569e6aaa 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -48,9 +48,12 @@ from transformers import ( get_scheduler, set_seed, ) +from transformers.utils.versions import require_version logger = logging.getLogger(__name__) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") + # You should update this to your particular problem to have better documentation of `model_type` MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -401,6 +404,7 @@ def main(): num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"]