From 9068fa6c577a932c46b57ac2e703a1f65883ab42 Mon Sep 17 00:00:00 2001 From: Shamane Siri Date: Wed, 15 Jun 2022 00:56:32 +1200 Subject: [PATCH] Rag end2end new (#17650) * check * update the RAG-end2end with new PL and RAY * removed unwanted comments --- .../rag-end2end-retriever/README.md | 14 ++++++++++---- .../rag-end2end-retriever/callbacks_rag.py | 2 +- .../rag-end2end-retriever/finetune_rag.py | 8 +++----- .../rag-end2end-retriever/lightning_base.py | 15 ++++++--------- .../rag-end2end-retriever/requirements.txt | 12 ++++++------ .../test_run/test_finetune.sh | 7 +++++-- 6 files changed, 31 insertions(+), 27 deletions(-) diff --git a/examples/research_projects/rag-end2end-retriever/README.md b/examples/research_projects/rag-end2end-retriever/README.md index 7cee2f1ea0..9bff4e8c29 100644 --- a/examples/research_projects/rag-end2end-retriever/README.md +++ b/examples/research_projects/rag-end2end-retriever/README.md @@ -15,6 +15,10 @@ This code can be modified to experiment with other research on retrival augmente To start training, use the bash script (finetune_rag_ray_end2end.sh) in this folder. This script also includes descriptions on each command-line argument used. +# Latest Update + +⚠️ Updated the rag-end2end-retriever to be compatible with PL==1.6.4 and RAY==1.13.0 (latest versions to the date 2022-June-11) + # Note ⚠️ This project should be run with pytorch-lightning==1.3.1 which has a potential security vulnerability @@ -22,12 +26,14 @@ To start training, use the bash script (finetune_rag_ray_end2end.sh) in this fol # Testing The following two bash scripts can be used to quickly test the implementation. -1. sh ./test_run/test_rag_new_features.sh - - Tests the newly added functions (set_context_encoder and set_context_encoder_tokenizer) related to modeling rag. - - This is sufficient to check the model's ability to use the set functions correctly. -2. sh ./test_run/test_finetune.sh script +1. sh ./test_run/test_finetune.sh script - Tests the full end-to-end fine-tuning ability with a dummy knowlendge-base and dummy training dataset (check test_dir directory). - Users can replace the dummy dataset and knowledge-base with their own to do their own finetuning. + - Please read the comments in the test_finetune.sh file. +2. sh ./test_run/test_rag_new_features.sh + - Tests the newly added functions (set_context_encoder and set_context_encoder_tokenizer) related to modeling rag. + - This is sufficient to check the model's ability to use the set functions correctly. + # Comparison of end2end RAG (including DPR finetuning) VS original-RAG diff --git a/examples/research_projects/rag-end2end-retriever/callbacks_rag.py b/examples/research_projects/rag-end2end-retriever/callbacks_rag.py index c05db23f18..5f18244a7a 100644 --- a/examples/research_projects/rag-end2end-retriever/callbacks_rag.py +++ b/examples/research_projects/rag-end2end-retriever/callbacks_rag.py @@ -41,7 +41,7 @@ def get_checkpoint_callback(output_dir, metric): monitor=f"val_{metric}", mode="max", save_top_k=1, - every_n_val_epochs=1, # works only with PL > 1.3 + every_n_epochs=1, # works only with PL > 1.3 ) return checkpoint_callback diff --git a/examples/research_projects/rag-end2end-retriever/finetune_rag.py b/examples/research_projects/rag-end2end-retriever/finetune_rag.py index ac781c3254..1229870e63 100644 --- a/examples/research_projects/rag-end2end-retriever/finetune_rag.py +++ b/examples/research_projects/rag-end2end-retriever/finetune_rag.py @@ -350,6 +350,7 @@ class GenerativeQAModule(BaseTransformer): concat.save_to_disk(self.config.passages_path) # here we update the main passage file on the disk logger.info("done updating the dataset") + # To Do (@Aaron) : Useful in the future dynamic memory implementation. # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker. # logger.info("then updating the index") # shutil.copy(self.custom_config.temp_index, self.config.idex_path) @@ -360,10 +361,7 @@ class GenerativeQAModule(BaseTransformer): isEmUpdateBusy = False isAddIndexBusy = False - - self.trainer.accelerator_connector.accelerator.barrier( - "barrier" - ) # waint untill the index and kb get re-initialized. + self.trainer.strategy.barrier("barrier") loss_tensors = self._step(batch) @@ -724,7 +722,7 @@ def main(args=None, model=None) -> GenerativeQAModule: raise RuntimeError("Please install Ray to use the Ray distributed retriever.") # Connect to an existing Ray cluster. try: - ray.init(address=args.ray_address) + ray.init(address=args.ray_address, namespace="rag") except (ConnectionError, ValueError): logger.warning( "Connection to Ray cluster failed. Make sure a Ray" diff --git a/examples/research_projects/rag-end2end-retriever/lightning_base.py b/examples/research_projects/rag-end2end-retriever/lightning_base.py index 1843b09148..8484294405 100644 --- a/examples/research_projects/rag-end2end-retriever/lightning_base.py +++ b/examples/research_projects/rag-end2end-retriever/lightning_base.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import Any, Dict import pytorch_lightning as pl -from pytorch_lightning.plugins.training_type import DDPPlugin from pytorch_lightning.utilities import rank_zero_info from transformers import ( @@ -386,24 +385,22 @@ def generic_train( train_params = {} - # TODO: remove with PyTorch 1.6 since pl uses native amp if args.fp16: train_params["precision"] = 16 - train_params["amp_level"] = args.fp16_opt_level if args.gpus > 1: - train_params["accelerator"] = "ddp" + train_params["accelerator"] = "auto" + train_params["strategy"] = "ddp" train_params["accumulate_grad_batches"] = args.accumulate_grad_batches - # train_params["accelerator"] = extra_train_kwargs.get("accelerator", None) - train_params["profiler"] = None # extra_train_kwargs.get("profiler", None) + train_params["profiler"] = None + train_params["devices"] = "auto" trainer = pl.Trainer.from_argparse_args( args, weights_summary=None, callbacks=[logging_callback] + extra_callbacks + [InitCallback()] + [checkpoint_callback], logger=logger, - plugins=[DDPPlugin(find_unused_parameters=True)], # this is needed in new pytorch-lightning new version val_check_interval=1, num_sanity_val_steps=2, **train_params, @@ -412,6 +409,6 @@ def generic_train( if args.do_train: trainer.fit(model) - # else: - # print("RAG modeling tests with new set functions successfuly executed!") + else: + print("RAG modeling tests with new set functions successfuly executed!") return trainer diff --git a/examples/research_projects/rag-end2end-retriever/requirements.txt b/examples/research_projects/rag-end2end-retriever/requirements.txt index aca89c78e8..32025229d0 100644 --- a/examples/research_projects/rag-end2end-retriever/requirements.txt +++ b/examples/research_projects/rag-end2end-retriever/requirements.txt @@ -1,7 +1,7 @@ -faiss-cpu >= 1.7.0 -datasets >= 1.6.2 -psutil >= 5.7.0 -torch >= 1.4.0 -pytorch-lightning +faiss-cpu >= 1.7.2 +datasets +psutil >= 5.9.1 +torch >= 1.11.0 +pytorch-lightning == 1.6.4 nvidia-ml-py3 == 7.352.0 -ray >= 1.3.0 +ray >= 1.13.0 \ No newline at end of file diff --git a/examples/research_projects/rag-end2end-retriever/test_run/test_finetune.sh b/examples/research_projects/rag-end2end-retriever/test_run/test_finetune.sh index bbf69b0538..c44d110d20 100755 --- a/examples/research_projects/rag-end2end-retriever/test_run/test_finetune.sh +++ b/examples/research_projects/rag-end2end-retriever/test_run/test_finetune.sh @@ -44,11 +44,14 @@ python finetune_rag.py \ --num_retrieval_workers 4 \ --index_name custom \ --context_encoder_name facebook/dpr-ctx_encoder-multiset-base \ - --index_gpus 1 \ - --gpu_order [6,7,8,9,0,1,2,3,5,4] \ + --index_gpus 2 \ + --gpu_order [2,3,4,5,6,7,8,9,0,1] \ --indexing_freq 5 # Stop the Ray cluster. ray stop + +#CUDA_VISIBLE_DEVICES=2,3,4,5,6,7,8,9,0,1 sh ./test_run/test_finetune.sh +#Make sure --gpu_order is same. \ No newline at end of file