Tokenizers: ability to load from model subfolder (#8586)

* <small>tiny typo</small>

* Tokenizers: ability to load from model subfolder

* use subfolder for local files as well

* Uniformize model shortcut name => model id

* from s3 => from huggingface.co

Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
This commit is contained in:
Julien Chaumond
2020-11-17 14:58:45 +01:00
committed by GitHub
parent 48395d6b8e
commit 042a6aa777
54 changed files with 210 additions and 186 deletions

View File

@@ -57,7 +57,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -476,7 +476,7 @@ def main():
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--max_seq_length",

View File

@@ -298,7 +298,7 @@ def main():
"--cache_dir",
default=None,
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."

View File

@@ -81,7 +81,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -350,7 +350,7 @@ def main():
"--cache_dir",
default=None,
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--max_seq_length",

View File

@@ -452,7 +452,7 @@ def main():
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--max_seq_length",

View File

@@ -578,7 +578,7 @@ def main():
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(

View File

@@ -76,7 +76,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

View File

@@ -74,7 +74,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

View File

@@ -76,7 +76,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

View File

@@ -64,7 +64,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

View File

@@ -236,7 +236,7 @@ class BaseTransformer(pl.LightningModule):
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--encoder_layerdrop",

View File

@@ -620,7 +620,7 @@ def main():
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--max_seq_length",

View File

@@ -725,7 +725,7 @@ def main():
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(

View File

@@ -61,7 +61,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -65,7 +65,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -532,7 +532,7 @@ def main():
"--cache_dir",
default="",
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(

View File

@@ -51,7 +51,8 @@ class ModelArguments:
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -63,7 +63,8 @@ class ModelArguments:
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -7,7 +7,7 @@ export PYTHONPATH="../":"${PYTHONPATH}"
python examples/rag/finetune.py \
--data_dir $DATA_DIR \
--output_dir $OUTPUT_DIR \
- --model_name_or_path $MODLE_NAME_OR_PATH \
+ --model_name_or_path $MODEL_NAME_OR_PATH \
--model_type rag_sequence \
--fp16 \
--gpus 8 \

View File

@@ -43,7 +43,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
freeze_encoder: bool = field(default=False, metadata={"help": "Whether tp freeze the encoder."})
freeze_embeds: bool = field(default=False, metadata={"help": "Whether to freeze the embeddings."})

View File

@@ -124,7 +124,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

View File

@@ -117,7 +117,8 @@ class ModelArguments:
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -182,7 +182,8 @@ class ModelArguments:
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -406,7 +406,7 @@ def main():
"--cache_dir",
default=None,
type=str,
- help="Where do you want to store the pre-trained models downloaded from s3",
+ help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--max_seq_length",

View File

@@ -60,7 +60,8 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -65,7 +65,8 @@ class ModelArguments:
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)

View File

@@ -67,7 +67,8 @@ class ModelArguments:
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
# or just modify its tokenizer_config.json.
cache_dir: Optional[str] = field(
- default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)