From 2cc3cc835fcf9b872a160e4ab4accdbee1614f4f Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 14 Mar 2024 16:48:11 +0100 Subject: [PATCH] Add `dataset_revision` argument to `RagConfig` (#29610) * add arg --------- Co-authored-by: ydshieh --- .../models/rag/configuration_rag.py | 2 ++ src/transformers/models/rag/retrieval_rag.py | 10 +++++++++- tests/models/rag/test_modeling_rag.py | 13 ++++++++++--- tests/models/rag/test_modeling_tf_rag.py | 17 +++++++++++++---- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py index 60f38ee6a5..2229e485db 100644 --- a/src/transformers/models/rag/configuration_rag.py +++ b/src/transformers/models/rag/configuration_rag.py @@ -111,6 +111,7 @@ class RagConfig(PretrainedConfig): output_retrieved=False, use_cache=True, forced_eos_token_id=None, + dataset_revision=None, **kwargs, ): super().__init__( @@ -156,6 +157,7 @@ class RagConfig(PretrainedConfig): self.passages_path = passages_path self.index_path = index_path self.use_dummy_dataset = use_dummy_dataset + self.dataset_revision = dataset_revision self.output_retrieved = output_retrieved diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py index 76f6231ec2..a448132300 100644 --- a/src/transformers/models/rag/retrieval_rag.py +++ b/src/transformers/models/rag/retrieval_rag.py @@ -266,6 +266,7 @@ class CanonicalHFIndex(HFIndexBase): index_name: Optional[str] = None, index_path: Optional[str] = None, use_dummy_dataset=False, + dataset_revision=None, ): if int(index_path is None) + int(index_name is None) != 1: raise ValueError("Please provide `index_name` or `index_path`.") @@ -274,9 +275,14 @@ class CanonicalHFIndex(HFIndexBase): self.index_name = index_name self.index_path = index_path self.use_dummy_dataset = use_dummy_dataset + self.dataset_revision = dataset_revision logger.info(f"Loading passages from {self.dataset_name}") dataset = load_dataset( - self.dataset_name, with_index=False, split=self.dataset_split, dummy=self.use_dummy_dataset + self.dataset_name, + with_index=False, + split=self.dataset_split, + dummy=self.use_dummy_dataset, + revision=dataset_revision, ) super().__init__(vector_size, dataset, index_initialized=False) @@ -293,6 +299,7 @@ class CanonicalHFIndex(HFIndexBase): split=self.dataset_split, index_name=self.index_name, dummy=self.use_dummy_dataset, + revision=self.dataset_revision, ) self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True) self._index_initialized = True @@ -427,6 +434,7 @@ class RagRetriever: index_name=config.index_name, index_path=config.index_path, use_dummy_dataset=config.use_dummy_dataset, + dataset_revision=config.dataset_revision, ) @classmethod diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index 48c7099620..69a321636e 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -730,6 +730,7 @@ class RagModelIntegrationTests(unittest.TestCase): use_dummy_dataset=True, retrieval_vector_size=768, retrieval_batch_size=8, + dataset_revision="b24a417", ) @slow @@ -905,7 +906,7 @@ class RagModelIntegrationTests(unittest.TestCase): def test_rag_sequence_generate_batch(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" ) rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to( torch_device @@ -944,7 +945,10 @@ class RagModelIntegrationTests(unittest.TestCase): def test_rag_sequence_generate_batch_from_context_input_ids(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + "facebook/rag-sequence-nq", + index_name="exact", + use_dummy_dataset=True, + dataset_revision="b24a417", ) rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to( torch_device @@ -993,7 +997,9 @@ class RagModelIntegrationTests(unittest.TestCase): @slow def test_rag_token_generate_batch(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") - retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True) + retriever = RagRetriever.from_pretrained( + "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" + ) rag_token = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to( torch_device ) @@ -1063,6 +1069,7 @@ class RagModelSaveLoadTests(unittest.TestCase): use_dummy_dataset=True, retrieval_vector_size=768, retrieval_batch_size=8, + dataset_revision="b24a417", ) @slow diff --git a/tests/models/rag/test_modeling_tf_rag.py b/tests/models/rag/test_modeling_tf_rag.py index a484017e60..ed15cfd7b6 100644 --- a/tests/models/rag/test_modeling_tf_rag.py +++ b/tests/models/rag/test_modeling_tf_rag.py @@ -590,6 +590,7 @@ class TFRagModelIntegrationTests(unittest.TestCase): use_dummy_dataset=True, retrieval_vector_size=768, retrieval_batch_size=8, + dataset_revision="b24a417", ) @slow @@ -794,7 +795,9 @@ class TFRagModelIntegrationTests(unittest.TestCase): @slow def test_rag_token_greedy_search(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") - retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True) + retriever = RagRetriever.from_pretrained( + "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" + ) rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) # check first two questions @@ -828,7 +831,9 @@ class TFRagModelIntegrationTests(unittest.TestCase): def test_rag_token_generate_batch(self): # NOTE: gold labels comes from num_beam=4, so this is effectively beam-search test tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") - retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True) + retriever = RagRetriever.from_pretrained( + "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" + ) rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) input_dict = tokenizer( @@ -871,7 +876,10 @@ class TFRagModelIntegrationTests(unittest.TestCase): def test_rag_sequence_generate_batch(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + "facebook/rag-sequence-nq", + index_name="exact", + use_dummy_dataset=True, + dataset_revision="b24a417", ) rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever) @@ -908,7 +916,7 @@ class TFRagModelIntegrationTests(unittest.TestCase): def test_rag_sequence_generate_batch_from_context_input_ids(self): tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417" ) rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever) input_dict = tokenizer( @@ -976,6 +984,7 @@ class TFRagModelSaveLoadTests(unittest.TestCase): use_dummy_dataset=True, retrieval_vector_size=768, retrieval_batch_size=8, + dataset_revision="b24a417", ) @slow