From 04dbea31a9b2e1e0bbb44ea6c2e0074d90cf0ba9 Mon Sep 17 00:00:00 2001 From: Bhadresh Savani Date: Mon, 28 Jun 2021 21:44:00 +0530 Subject: [PATCH] [Examples] Added context manager to datasets map (#12367) * added cotext manager to datasets map * fixed style and spaces * fixed warning of deprecation * changed desc --- examples/pytorch/language-modeling/run_clm.py | 32 ++++++------ examples/pytorch/language-modeling/run_mlm.py | 49 +++++++++--------- examples/pytorch/language-modeling/run_plm.py | 49 +++++++++--------- examples/pytorch/multiple-choice/run_swag.py | 26 +++++----- examples/pytorch/question-answering/run_qa.py | 51 ++++++++++--------- .../question-answering/run_qa_beam_search.py | 51 ++++++++++--------- .../summarization/run_summarization.py | 51 ++++++++++--------- .../pytorch/text-classification/run_glue.py | 15 +++--- .../pytorch/text-classification/run_xnli.py | 39 +++++++------- .../pytorch/token-classification/run_ner.py | 45 ++++++++-------- .../run_{{cookiecutter.example_shortcut}}.py | 45 ++++++++-------- .../pytorch/run_glue_model_parallelism.py | 2 +- 12 files changed, 242 insertions(+), 213 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index a30278a615..cd76849f75 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -356,14 +356,15 @@ def main(): ) return output - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) if data_args.block_size is None: block_size = tokenizer.model_max_length @@ -404,13 +405,14 @@ def main(): # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) + with training_args.main_process_first(desc="grouping texts together"): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) if training_args.do_train: if "train" not in tokenized_datasets: diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 57bd1d891f..f016e3df01 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -383,14 +383,15 @@ def main(): return_special_tokens_mask=True, ) - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", + ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more @@ -398,14 +399,15 @@ def main(): def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", + ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. @@ -430,13 +432,14 @@ def main(): # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) + with training_args.main_process_first(desc="grouping texts together"): + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) if training_args.do_train: if "train" not in tokenized_datasets: diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index e608827f34..125dc88443 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -359,27 +359,29 @@ def main(): examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length) - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", + ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. def tokenize_function(examples): return tokenizer(examples[text_column_name]) - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) + with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", + ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. @@ -404,13 +406,14 @@ def main(): # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) + with training_args.main_process_first(desc="grouping texts together"): + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) if training_args.do_train: if "train" not in tokenized_datasets: diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index bdbd5cf911..b754b2e34b 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -353,12 +353,13 @@ def main(): train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - ) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + ) if training_args.do_eval: if "validation" not in raw_datasets: @@ -366,12 +367,13 @@ def main(): eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - ) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + ) # Data collator data_collator = ( diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index db08cc22af..80917bb1a2 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -418,14 +418,15 @@ def main(): # We will select sample from whole data if agument is specified train_dataset = train_dataset.select(range(data_args.max_train_samples)) # Create train feature from dataset - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select(range(data_args.max_train_samples)) @@ -480,14 +481,15 @@ def main(): # We will select sample from whole data eval_examples = eval_examples.select(range(data_args.max_eval_samples)) # Validation Feature Creation - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) if data_args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) @@ -500,14 +502,15 @@ def main(): # We will select sample from whole data predict_examples = predict_examples.select(range(data_args.max_predict_samples)) # Predict Feature Creation - predict_dataset = predict_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 52a27a162d..975b3bbaaa 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -429,14 +429,15 @@ def main(): # Select samples from Dataset, This will help to decrease processing time train_dataset = train_dataset.select(range(data_args.max_train_samples)) # Create Training Features - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) if data_args.max_train_samples is not None: # Select samples from dataset again since Feature Creation might increase number of features train_dataset = train_dataset.select(range(data_args.max_train_samples)) @@ -515,14 +516,15 @@ def main(): # Selecting Eval Samples from Dataset eval_examples = eval_examples.select(range(data_args.max_eval_samples)) # Create Features from Eval Dataset - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) if data_args.max_eval_samples is not None: # Selecting Samples from Dataset again since Feature Creation might increase samples size eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) @@ -535,14 +537,15 @@ def main(): # We will select sample from whole data predict_examples = predict_examples.select(range(data_args.max_predict_samples)) # Test Feature Creation - predict_dataset = predict_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 9e7c13e266..954f46c136 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -435,14 +435,15 @@ def main(): train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) if training_args.do_eval: max_target_length = data_args.val_max_target_length @@ -451,14 +452,15 @@ def main(): eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) if training_args.do_predict: max_target_length = data_args.val_max_target_length @@ -467,14 +469,15 @@ def main(): predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - predict_dataset = predict_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 99606fd909..a652d0727c 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -400,12 +400,13 @@ def main(): result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] return result - raw_datasets = raw_datasets.map( - preprocess_function, - batched=True, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) + with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") @@ -526,7 +527,7 @@ def main(): for predict_dataset, task in zip(predict_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. - predict_dataset.remove_columns_("label") + predict_dataset = predict_dataset.remove_columns("label") predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index ca037ae079..5c0215b745 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -280,12 +280,13 @@ def main(): if training_args.do_train: if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") @@ -293,22 +294,24 @@ def main(): if training_args.do_eval: if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) if training_args.do_predict: if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - predict_dataset = predict_dataset.map( - preprocess_function, - batched=True, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_dataset.map( + preprocess_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) # Get the metric function metric = load_metric("xnli") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index cbdd0379cb..41c4c1e111 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -390,13 +390,14 @@ def main(): train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) - train_dataset = train_dataset.map( - tokenize_and_align_labels, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) if training_args.do_eval: if "validation" not in raw_datasets: @@ -404,13 +405,14 @@ def main(): eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - eval_dataset = eval_dataset.map( - tokenize_and_align_labels, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) if training_args.do_predict: if "test" not in raw_datasets: @@ -418,13 +420,14 @@ def main(): predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - predict_dataset = predict_dataset.map( - tokenize_and_align_labels, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_dataset.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) # Data collator data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py index f64076bd95..564145adb9 100755 --- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py +++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py @@ -370,13 +370,14 @@ def main(): # Select Sample from Dataset train_dataset = train_dataset.select(range(data_args.max_train_samples)) # tokenize train dataset in batch - train_dataset = train_dataset.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - ) + with training_args.main_process_first(desc="train dataset map tokenization"): + train_dataset = train_dataset.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + ) if training_args.do_eval: if "validation" not in raw_datasets: @@ -386,13 +387,14 @@ def main(): if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) # tokenize validation dataset - eval_dataset = eval_dataset.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - ) + with training_args.main_process_first(desc="validation dataset map tokenization"): + eval_dataset = eval_dataset.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + ) if training_args.do_predict: if "test" not in raw_datasets: @@ -402,13 +404,14 @@ def main(): if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) # tokenize predict dataset - predict_dataset = predict_dataset.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - ) + with training_args.main_process_first(desc="prediction dataset map tokenization"): + predict_dataset = predict_dataset.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + ) # Data collator data_collator=default_data_collator if not training_args.fp16 else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index 4b31b9cfe8..6bec48fda7 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -503,7 +503,7 @@ def main(): for test_dataset, task in zip(test_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. - test_dataset.remove_columns_("label") + test_dataset = test_dataset.remove_columns("label") predictions = trainer.predict(test_dataset=test_dataset).predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)