Add dataset_revision argument to RagConfig (#29610)
* add arg

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -111,6 +111,7 @@ class RagConfig(PretrainedConfig):
|
|||||||
output_retrieved=False,
|
output_retrieved=False,
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
forced_eos_token_id=None,
|
forced_eos_token_id=None,
|
||||||
|
dataset_revision=None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
@@ -156,6 +157,7 @@ class RagConfig(PretrainedConfig):
|
|||||||
self.passages_path = passages_path
|
self.passages_path = passages_path
|
||||||
self.index_path = index_path
|
self.index_path = index_path
|
||||||
self.use_dummy_dataset = use_dummy_dataset
|
self.use_dummy_dataset = use_dummy_dataset
|
||||||
|
self.dataset_revision = dataset_revision
|
||||||
|
|
||||||
self.output_retrieved = output_retrieved
|
self.output_retrieved = output_retrieved
|
||||||
|
|
||||||
|
|||||||
@@ -266,6 +266,7 @@ class CanonicalHFIndex(HFIndexBase):
|
|||||||
index_name: Optional[str] = None,
|
index_name: Optional[str] = None,
|
||||||
index_path: Optional[str] = None,
|
index_path: Optional[str] = None,
|
||||||
use_dummy_dataset=False,
|
use_dummy_dataset=False,
|
||||||
|
dataset_revision=None,
|
||||||
):
|
):
|
||||||
if int(index_path is None) + int(index_name is None) != 1:
|
if int(index_path is None) + int(index_name is None) != 1:
|
||||||
raise ValueError("Please provide `index_name` or `index_path`.")
|
raise ValueError("Please provide `index_name` or `index_path`.")
|
||||||
@@ -274,9 +275,14 @@ class CanonicalHFIndex(HFIndexBase):
|
|||||||
self.index_name = index_name
|
self.index_name = index_name
|
||||||
self.index_path = index_path
|
self.index_path = index_path
|
||||||
self.use_dummy_dataset = use_dummy_dataset
|
self.use_dummy_dataset = use_dummy_dataset
|
||||||
|
self.dataset_revision = dataset_revision
|
||||||
logger.info(f"Loading passages from {self.dataset_name}")
|
logger.info(f"Loading passages from {self.dataset_name}")
|
||||||
dataset = load_dataset(
|
dataset = load_dataset(
|
||||||
self.dataset_name, with_index=False, split=self.dataset_split, dummy=self.use_dummy_dataset
|
self.dataset_name,
|
||||||
|
with_index=False,
|
||||||
|
split=self.dataset_split,
|
||||||
|
dummy=self.use_dummy_dataset,
|
||||||
|
revision=dataset_revision,
|
||||||
)
|
)
|
||||||
super().__init__(vector_size, dataset, index_initialized=False)
|
super().__init__(vector_size, dataset, index_initialized=False)
|
||||||
|
|
||||||
@@ -293,6 +299,7 @@ class CanonicalHFIndex(HFIndexBase):
|
|||||||
split=self.dataset_split,
|
split=self.dataset_split,
|
||||||
index_name=self.index_name,
|
index_name=self.index_name,
|
||||||
dummy=self.use_dummy_dataset,
|
dummy=self.use_dummy_dataset,
|
||||||
|
revision=self.dataset_revision,
|
||||||
)
|
)
|
||||||
self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True)
|
self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True)
|
||||||
self._index_initialized = True
|
self._index_initialized = True
|
||||||
@@ -427,6 +434,7 @@ class RagRetriever:
|
|||||||
index_name=config.index_name,
|
index_name=config.index_name,
|
||||||
index_path=config.index_path,
|
index_path=config.index_path,
|
||||||
use_dummy_dataset=config.use_dummy_dataset,
|
use_dummy_dataset=config.use_dummy_dataset,
|
||||||
|
dataset_revision=config.dataset_revision,
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -730,6 +730,7 @@ class RagModelIntegrationTests(unittest.TestCase):
|
|||||||
use_dummy_dataset=True,
|
use_dummy_dataset=True,
|
||||||
retrieval_vector_size=768,
|
retrieval_vector_size=768,
|
||||||
retrieval_batch_size=8,
|
retrieval_batch_size=8,
|
||||||
|
dataset_revision="b24a417",
|
||||||
)
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@@ -905,7 +906,7 @@ class RagModelIntegrationTests(unittest.TestCase):
|
|||||||
def test_rag_sequence_generate_batch(self):
|
def test_rag_sequence_generate_batch(self):
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
||||||
retriever = RagRetriever.from_pretrained(
|
retriever = RagRetriever.from_pretrained(
|
||||||
"facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
|
"facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
|
||||||
)
|
)
|
||||||
rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(
|
rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(
|
||||||
torch_device
|
torch_device
|
||||||
@@ -944,7 +945,10 @@ class RagModelIntegrationTests(unittest.TestCase):
|
|||||||
def test_rag_sequence_generate_batch_from_context_input_ids(self):
|
def test_rag_sequence_generate_batch_from_context_input_ids(self):
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
||||||
retriever = RagRetriever.from_pretrained(
|
retriever = RagRetriever.from_pretrained(
|
||||||
"facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
|
"facebook/rag-sequence-nq",
|
||||||
|
index_name="exact",
|
||||||
|
use_dummy_dataset=True,
|
||||||
|
dataset_revision="b24a417",
|
||||||
)
|
)
|
||||||
rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(
|
rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(
|
||||||
torch_device
|
torch_device
|
||||||
@@ -993,7 +997,9 @@ class RagModelIntegrationTests(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
def test_rag_token_generate_batch(self):
|
def test_rag_token_generate_batch(self):
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
|
||||||
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
|
retriever = RagRetriever.from_pretrained(
|
||||||
|
"facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
|
||||||
|
)
|
||||||
rag_token = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to(
|
rag_token = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to(
|
||||||
torch_device
|
torch_device
|
||||||
)
|
)
|
||||||
@@ -1063,6 +1069,7 @@ class RagModelSaveLoadTests(unittest.TestCase):
|
|||||||
use_dummy_dataset=True,
|
use_dummy_dataset=True,
|
||||||
retrieval_vector_size=768,
|
retrieval_vector_size=768,
|
||||||
retrieval_batch_size=8,
|
retrieval_batch_size=8,
|
||||||
|
dataset_revision="b24a417",
|
||||||
)
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
|||||||
@@ -590,6 +590,7 @@ class TFRagModelIntegrationTests(unittest.TestCase):
|
|||||||
use_dummy_dataset=True,
|
use_dummy_dataset=True,
|
||||||
retrieval_vector_size=768,
|
retrieval_vector_size=768,
|
||||||
retrieval_batch_size=8,
|
retrieval_batch_size=8,
|
||||||
|
dataset_revision="b24a417",
|
||||||
)
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@@ -794,7 +795,9 @@ class TFRagModelIntegrationTests(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
def test_rag_token_greedy_search(self):
|
def test_rag_token_greedy_search(self):
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
|
||||||
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
|
retriever = RagRetriever.from_pretrained(
|
||||||
|
"facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
|
||||||
|
)
|
||||||
rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
|
rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
|
||||||
|
|
||||||
# check first two questions
|
# check first two questions
|
||||||
@@ -828,7 +831,9 @@ class TFRagModelIntegrationTests(unittest.TestCase):
|
|||||||
def test_rag_token_generate_batch(self):
|
def test_rag_token_generate_batch(self):
|
||||||
# NOTE: gold labels comes from num_beam=4, so this is effectively beam-search test
|
# NOTE: gold labels comes from num_beam=4, so this is effectively beam-search test
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
|
||||||
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
|
retriever = RagRetriever.from_pretrained(
|
||||||
|
"facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
|
||||||
|
)
|
||||||
rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
|
rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
|
||||||
|
|
||||||
input_dict = tokenizer(
|
input_dict = tokenizer(
|
||||||
@@ -871,7 +876,10 @@ class TFRagModelIntegrationTests(unittest.TestCase):
|
|||||||
def test_rag_sequence_generate_batch(self):
|
def test_rag_sequence_generate_batch(self):
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
||||||
retriever = RagRetriever.from_pretrained(
|
retriever = RagRetriever.from_pretrained(
|
||||||
"facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
|
"facebook/rag-sequence-nq",
|
||||||
|
index_name="exact",
|
||||||
|
use_dummy_dataset=True,
|
||||||
|
dataset_revision="b24a417",
|
||||||
)
|
)
|
||||||
rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
|
rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
|
||||||
|
|
||||||
@@ -908,7 +916,7 @@ class TFRagModelIntegrationTests(unittest.TestCase):
|
|||||||
def test_rag_sequence_generate_batch_from_context_input_ids(self):
|
def test_rag_sequence_generate_batch_from_context_input_ids(self):
|
||||||
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
|
||||||
retriever = RagRetriever.from_pretrained(
|
retriever = RagRetriever.from_pretrained(
|
||||||
"facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
|
"facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
|
||||||
)
|
)
|
||||||
rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
|
rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
|
||||||
input_dict = tokenizer(
|
input_dict = tokenizer(
|
||||||
@@ -976,6 +984,7 @@ class TFRagModelSaveLoadTests(unittest.TestCase):
|
|||||||
use_dummy_dataset=True,
|
use_dummy_dataset=True,
|
||||||
retrieval_vector_size=768,
|
retrieval_vector_size=768,
|
||||||
retrieval_batch_size=8,
|
retrieval_batch_size=8,
|
||||||
|
dataset_revision="b24a417",
|
||||||
)
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
|||||||
Reference in New Issue
Block a user