update desc for map in all examples (#12226)
* update desc for map in all examples * added plm * suggestions
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
torch >= 1.3
|
||||
datasets >= 1.1.3
|
||||
datasets >= 1.8.0
|
||||
sentencepiece != 0.1.92
|
||||
protobuf
|
||||
|
||||
@@ -46,10 +46,12 @@ from transformers import (
|
||||
from transformers.testing_utils import CaptureLogger
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from transformers.utils import check_min_version
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.8.0.dev0")
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -355,6 +357,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset",
|
||||
)
|
||||
|
||||
if data_args.block_size is None:
|
||||
@@ -401,6 +404,7 @@ def main():
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc=f"Grouping texts in chunks of {block_size}",
|
||||
)
|
||||
|
||||
if training_args.do_train:
|
||||
|
||||
@@ -48,9 +48,13 @@ from transformers import (
|
||||
get_scheduler,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
@@ -300,6 +304,7 @@ def main():
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset",
|
||||
)
|
||||
|
||||
if args.block_size is None:
|
||||
@@ -346,6 +351,7 @@ def main():
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc=f"Grouping texts in chunks of {block_size}",
|
||||
)
|
||||
|
||||
train_dataset = lm_datasets["train"]
|
||||
|
||||
@@ -45,10 +45,12 @@ from transformers import (
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from transformers.utils import check_min_version
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.8.0.dev0")
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
|
||||
@@ -380,6 +382,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=[text_column_name],
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset line_by_line",
|
||||
)
|
||||
else:
|
||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||
@@ -394,6 +397,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on every text in dataset",
|
||||
)
|
||||
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||
@@ -424,6 +428,7 @@ def main():
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||
)
|
||||
|
||||
if training_args.do_train:
|
||||
|
||||
@@ -48,9 +48,11 @@ from transformers import (
|
||||
get_scheduler,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
@@ -346,6 +348,7 @@ def main():
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=[text_column_name],
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset line_by_line",
|
||||
)
|
||||
else:
|
||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||
@@ -360,6 +363,7 @@ def main():
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on every text in dataset",
|
||||
)
|
||||
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||
@@ -390,6 +394,7 @@ def main():
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||
)
|
||||
|
||||
train_dataset = tokenized_datasets["train"]
|
||||
|
||||
@@ -41,10 +41,12 @@ from transformers import (
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from transformers.utils import check_min_version
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.8.0.dev0")
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -358,6 +360,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=[text_column_name],
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset line_by_line",
|
||||
)
|
||||
else:
|
||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||
@@ -370,6 +373,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on every text in dataset",
|
||||
)
|
||||
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||
@@ -400,6 +404,7 @@ def main():
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||
)
|
||||
|
||||
if training_args.do_train:
|
||||
|
||||
Reference in New Issue
Block a user