update desc for map in all examples (#12226)
* update desc for map in all examples * added plm * suggestions
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
torch >= 1.3
|
torch >= 1.3
|
||||||
datasets >= 1.1.3
|
datasets >= 1.8.0
|
||||||
sentencepiece != 0.1.92
|
sentencepiece != 0.1.92
|
||||||
protobuf
|
protobuf
|
||||||
|
|||||||
@@ -46,10 +46,12 @@ from transformers import (
|
|||||||
from transformers.testing_utils import CaptureLogger
|
from transformers.testing_utils import CaptureLogger
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -355,6 +357,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if data_args.block_size is None:
|
if data_args.block_size is None:
|
||||||
@@ -401,6 +404,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc=f"Grouping texts in chunks of {block_size}",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
|
|||||||
@@ -48,9 +48,13 @@ from transformers import (
|
|||||||
get_scheduler,
|
get_scheduler,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||||
|
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||||
|
|
||||||
@@ -300,6 +304,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.block_size is None:
|
if args.block_size is None:
|
||||||
@@ -346,6 +351,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc=f"Grouping texts in chunks of {block_size}",
|
||||||
)
|
)
|
||||||
|
|
||||||
train_dataset = lm_datasets["train"]
|
train_dataset = lm_datasets["train"]
|
||||||
|
|||||||
@@ -45,10 +45,12 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
|
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
|
||||||
@@ -380,6 +382,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=[text_column_name],
|
remove_columns=[text_column_name],
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset line_by_line",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||||
@@ -394,6 +397,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on every text in dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||||
@@ -424,6 +428,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
|
|||||||
@@ -48,9 +48,11 @@ from transformers import (
|
|||||||
get_scheduler,
|
get_scheduler,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||||
|
|
||||||
@@ -346,6 +348,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=[text_column_name],
|
remove_columns=[text_column_name],
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset line_by_line",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||||
@@ -360,6 +363,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on every text in dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||||
@@ -390,6 +394,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||||
)
|
)
|
||||||
|
|
||||||
train_dataset = tokenized_datasets["train"]
|
train_dataset = tokenized_datasets["train"]
|
||||||
|
|||||||
@@ -41,10 +41,12 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -358,6 +360,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=[text_column_name],
|
remove_columns=[text_column_name],
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset line_by_line",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||||
@@ -370,6 +373,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on every text in dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||||
@@ -400,6 +404,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
datasets >= 1.4.0
|
datasets >= 1.8.0
|
||||||
torch >= 1.3.0
|
torch >= 1.3.0
|
||||||
|
|||||||
@@ -42,11 +42,13 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
from utils_qa import postprocess_qa_predictions
|
from utils_qa import postprocess_qa_predictions
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -417,6 +419,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
@@ -478,6 +481,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
@@ -497,6 +501,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
|
|||||||
@@ -41,11 +41,13 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
from utils_qa import postprocess_qa_predictions_with_beam_search
|
from utils_qa import postprocess_qa_predictions_with_beam_search
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -429,6 +431,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Select samples from dataset again since Feature Creation might increase number of features
|
# Select samples from dataset again since Feature Creation might increase number of features
|
||||||
@@ -514,6 +517,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# Selecting Samples from Dataset again since Feature Creation might increase samples size
|
# Selecting Samples from Dataset again since Feature Creation might increase samples size
|
||||||
@@ -533,6 +537,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
|
|||||||
@@ -46,11 +46,13 @@ from transformers import (
|
|||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
from utils_qa import postprocess_qa_predictions_with_beam_search
|
from utils_qa import postprocess_qa_predictions_with_beam_search
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -419,6 +421,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
if args.max_train_samples is not None:
|
if args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
@@ -503,6 +506,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.max_eval_samples is not None:
|
if args.max_eval_samples is not None:
|
||||||
@@ -523,6 +527,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
if args.max_predict_samples is not None:
|
if args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
|
|||||||
@@ -48,11 +48,13 @@ from transformers import (
|
|||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
from utils_qa import postprocess_qa_predictions
|
from utils_qa import postprocess_qa_predictions
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
# You should update this to your particular problem to have better documentation of `model_type`
|
# You should update this to your particular problem to have better documentation of `model_type`
|
||||||
@@ -448,6 +450,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
if args.max_train_samples is not None:
|
if args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
@@ -508,6 +511,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.max_eval_samples is not None:
|
if args.max_eval_samples is not None:
|
||||||
@@ -528,6 +532,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
if args.max_predict_samples is not None:
|
if args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
datasets >= 1.1.3
|
datasets >= 1.8.0
|
||||||
sentencepiece != 0.1.92
|
sentencepiece != 0.1.92
|
||||||
protobuf
|
protobuf
|
||||||
rouge-score
|
rouge-score
|
||||||
|
|||||||
@@ -43,10 +43,12 @@ from transformers import (
|
|||||||
from transformers.file_utils import is_offline_mode
|
from transformers.file_utils import is_offline_mode
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -433,6 +435,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
@@ -448,6 +451,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
@@ -463,6 +467,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
|
|||||||
@@ -48,9 +48,12 @@ from transformers import (
|
|||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.file_utils import is_offline_mode
|
from transformers.file_utils import is_offline_mode
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||||
|
|
||||||
# You should update this to your particular problem to have better documentation of `model_type`
|
# You should update this to your particular problem to have better documentation of `model_type`
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||||
@@ -419,7 +422,11 @@ def main():
|
|||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
processed_datasets = raw_datasets.map(
|
processed_datasets = raw_datasets.map(
|
||||||
preprocess_function, batched=True, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache
|
preprocess_function,
|
||||||
|
batched=True,
|
||||||
|
remove_columns=column_names,
|
||||||
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
train_dataset = processed_datasets["train"]
|
train_dataset = processed_datasets["train"]
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
seqeval
|
seqeval
|
||||||
datasets >= 1.1.3
|
datasets >= 1.8.0
|
||||||
torch >= 1.3
|
torch >= 1.3
|
||||||
|
|||||||
@@ -42,10 +42,12 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -388,6 +390,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
@@ -401,6 +404,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
@@ -414,6 +418,7 @@ def main():
|
|||||||
batched=True,
|
batched=True,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
|
|||||||
@@ -45,9 +45,12 @@ from transformers import (
|
|||||||
get_scheduler,
|
get_scheduler,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||||
|
|
||||||
# You should update this to your particular problem to have better documentation of `model_type`
|
# You should update this to your particular problem to have better documentation of `model_type`
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||||
@@ -381,7 +384,10 @@ def main():
|
|||||||
return tokenized_inputs
|
return tokenized_inputs
|
||||||
|
|
||||||
processed_raw_datasets = raw_datasets.map(
|
processed_raw_datasets = raw_datasets.map(
|
||||||
tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names
|
tokenize_and_align_labels,
|
||||||
|
batched=True,
|
||||||
|
remove_columns=raw_datasets["train"].column_names,
|
||||||
|
desc="Running tokenizer on dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
train_dataset = processed_raw_datasets["train"]
|
train_dataset = processed_raw_datasets["train"]
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
datasets >= 1.1.3
|
datasets >= 1.8.0
|
||||||
sentencepiece != 0.1.92
|
sentencepiece != 0.1.92
|
||||||
protobuf
|
protobuf
|
||||||
sacrebleu >= 1.4.12
|
sacrebleu >= 1.4.12
|
||||||
|
|||||||
@@ -46,10 +46,12 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
from transformers.utils import check_min_version
|
from transformers.utils import check_min_version
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.8.0.dev0")
|
check_min_version("4.8.0.dev0")
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -427,6 +429,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on train dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
@@ -442,6 +445,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on validation dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
@@ -457,6 +461,7 @@ def main():
|
|||||||
num_proc=data_args.preprocessing_num_workers,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on prediction dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
|
|||||||
@@ -48,9 +48,12 @@ from transformers import (
|
|||||||
get_scheduler,
|
get_scheduler,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
|
from transformers.utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||||
|
|
||||||
# You should update this to your particular problem to have better documentation of `model_type`
|
# You should update this to your particular problem to have better documentation of `model_type`
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||||
@@ -401,6 +404,7 @@ def main():
|
|||||||
num_proc=args.preprocessing_num_workers,
|
num_proc=args.preprocessing_num_workers,
|
||||||
remove_columns=column_names,
|
remove_columns=column_names,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
|
desc="Running tokenizer on dataset",
|
||||||
)
|
)
|
||||||
|
|
||||||
train_dataset = processed_datasets["train"]
|
train_dataset = processed_datasets["train"]
|
||||||
|
|||||||
Reference in New Issue
Block a user