[RAG] Clean Rag readme in examples (#7413)
* Improve README + consolidation script * Reformat README * Reformat README Co-authored-by: Your Name <you@example.com>
This commit is contained in:
99
examples/rag/consolidate_rag_checkpoint.py
Normal file
99
examples/rag/consolidate_rag_checkpoint.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
A script creating a RAG checkpoint from generator and question encoder checkpoints.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import AutoConfig, AutoTokenizer, RagConfig, RagSequenceForGeneration, RagTokenForGeneration
|
||||
|
||||
|
||||
def consolidate(
    model_type,
    generator_name_or_path: str,
    question_encoder_name_or_path: str,
    dest_dir: Path,
    config_name_or_path: str = None,
    generator_tokenizer_name_or_path: str = None,
    question_encoder_tokenizer_name_or_path: str = None,
):
    """Combine a generator and a question encoder into one RAG checkpoint saved under ``dest_dir``.

    The RAG config is loaded from ``config_name_or_path``, the generator and question encoder
    configs are attached to it, and the model is assembled with
    ``from_pretrained_question_encoder_generator``. After saving, the checkpoint is loaded back
    once as a sanity check. Both tokenizers are then saved into the ``generator_tokenizer/`` and
    ``question_encoder_tokenizer/`` sub-directories of ``dest_dir``.

    Args:
        model_type: Either ``"rag_sequence"`` or ``"rag_token"``; selects the model class and the
            default base config.
        generator_name_or_path: Generator model identifier.
        question_encoder_name_or_path: Question encoder model identifier.
        dest_dir: Output checkpoint directory (a ``Path`` or a plain string path).
        config_name_or_path: RAG config identifier. When ``None``, ``facebook/rag-token-base`` or
            ``facebook/rag-sequence-base`` is used depending on ``model_type``.
        generator_tokenizer_name_or_path: Generator tokenizer identifier. When ``None``,
            ``generator_name_or_path`` is used.
        question_encoder_tokenizer_name_or_path: Question encoder tokenizer identifier. When
            ``None``, ``question_encoder_name_or_path`` is used.

    Raises:
        ValueError: If ``model_type`` is not one of the two supported values.
    """
    # Fail fast on unknown types: without this check any value other than "rag_token"
    # (e.g. the typo "rag-token") would silently build a RAG-sequence model.
    if model_type not in ("rag_sequence", "rag_token"):
        raise ValueError(f"model_type must be 'rag_sequence' or 'rag_token', got {model_type!r}")

    # Accept plain string paths too; the "/" joins below need a Path.
    dest_dir = Path(dest_dir)

    is_rag_token = model_type == "rag_token"

    if config_name_or_path is None:
        config_name_or_path = "facebook/rag-token-base" if is_rag_token else "facebook/rag-sequence-base"

    if generator_tokenizer_name_or_path is None:
        generator_tokenizer_name_or_path = generator_name_or_path

    if question_encoder_tokenizer_name_or_path is None:
        question_encoder_tokenizer_name_or_path = question_encoder_name_or_path

    model_class = RagTokenForGeneration if is_rag_token else RagSequenceForGeneration

    # Save model.
    rag_config = RagConfig.from_pretrained(config_name_or_path)
    gen_config = AutoConfig.from_pretrained(generator_name_or_path)
    question_encoder_config = AutoConfig.from_pretrained(question_encoder_name_or_path)

    rag_config.generator = gen_config
    rag_config.question_encoder = question_encoder_config

    rag_model = model_class.from_pretrained_question_encoder_generator(
        question_encoder_name_or_path, generator_name_or_path, config=rag_config
    )
    rag_model.save_pretrained(dest_dir)

    # Sanity check: the checkpoint that was just written must load back.
    model_class.from_pretrained(dest_dir)

    # Save tokenizers.
    gen_tokenizer = AutoTokenizer.from_pretrained(generator_tokenizer_name_or_path)
    gen_tokenizer.save_pretrained(dest_dir / "generator_tokenizer/")
    question_encoder_tokenizer = AutoTokenizer.from_pretrained(question_encoder_tokenizer_name_or_path)
    question_encoder_tokenizer.save_pretrained(dest_dir / "question_encoder_tokenizer/")
|
||||
|
||||
|
||||
def main(argv=None):
    """Parse command-line options and write the consolidated RAG checkpoint.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_type",
        choices=["rag_sequence", "rag_token"],
        required=True,
        type=str,
        help="RAG model type: rag_sequence, rag_token",
    )
    parser.add_argument("--dest", type=str, required=True, help="Path to the output checkpoint directory.")
    parser.add_argument("--generator_name_or_path", type=str, required=True, help="Generator model identifier")
    parser.add_argument(
        "--question_encoder_name_or_path", type=str, required=True, help="Question encoder model identifier"
    )

    parser.add_argument(
        "--generator_tokenizer_name_or_path",
        type=str,
        help="Generator tokenizer identifier, if not specified, resolves to ``generator_name_or_path``",
    )
    parser.add_argument(
        "--question_encoder_tokenizer_name_or_path",
        type=str,
        help="Question encoder tokenizer identifier, if not specified, resolves to ``question_encoder_name_or_path``",
    )
    parser.add_argument(
        "--config_name_or_path",
        type=str,
        help="Identifier of the model config to use, if not provided, resolves to a base config for a given ``model_type``",
    )

    args = parser.parse_args(argv)

    dest_dir = Path(args.dest)
    # parents=True: a nested --dest whose parent directories do not exist yet
    # would otherwise make mkdir raise FileNotFoundError.
    dest_dir.mkdir(parents=True, exist_ok=True)

    consolidate(
        args.model_type,
        args.generator_name_or_path,
        args.question_encoder_name_or_path,
        dest_dir,
        args.config_name_or_path,
        args.generator_tokenizer_name_or_path,
        args.question_encoder_tokenizer_name_or_path,
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user