Pass datasets trust_remote_code (#31406)
* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI"
  This reverts commit b7672826ca.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples"
  This reverts commit 833fc17a3e.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg
This commit is contained in:
committed by
GitHub
parent
485fd81471
commit
a14b055b65
@@ -215,7 +215,9 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
|
||||
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
ds = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
||||
)
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
|
||||
@@ -410,7 +410,9 @@ class FlaxWhisperModelIntegrationTest(unittest.TestCase):
|
||||
return WhisperProcessor.from_pretrained("openai/whisper-base")
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
ds = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
||||
)
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
@@ -561,7 +563,7 @@ class FlaxWhisperModelIntegrationTest(unittest.TestCase):
|
||||
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
|
||||
model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-large", from_pt=True)
|
||||
|
||||
ds = load_dataset("common_voice", "ja", split="test", streaming=True)
|
||||
ds = load_dataset("legacy-datasets/common_voice", "ja", split="test", streaming=True, trust_remote_code=True)
|
||||
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
|
||||
input_speech = next(iter(ds))["audio"]["array"]
|
||||
input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="np")
|
||||
|
||||
@@ -704,7 +704,7 @@ class TFWhisperModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
|
||||
|
||||
|
||||
def _load_datasamples(num_samples):
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
@@ -795,7 +795,7 @@ def _test_large_generation_multilingual(in_queue, out_queue, timeout):
|
||||
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
|
||||
model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
|
||||
|
||||
ds = load_dataset("common_voice", "ja", split="test", streaming=True)
|
||||
ds = load_dataset("legacy-datasets/common_voice", "ja", split="test", streaming=True, trust_remote_code=True)
|
||||
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
|
||||
input_speech = next(iter(ds))["audio"]["array"]
|
||||
input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="tf").input_features
|
||||
|
||||
@@ -1552,7 +1552,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
return WhisperProcessor.from_pretrained("openai/whisper-base")
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
ds = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
||||
)
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
@@ -1763,7 +1765,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
|
||||
model.to(torch_device)
|
||||
|
||||
ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True)
|
||||
ds = load_dataset(
|
||||
"facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True
|
||||
)
|
||||
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
|
||||
|
||||
input_speech = next(iter(ds))["audio"]["array"]
|
||||
@@ -1830,7 +1834,14 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model.to(torch_device)
|
||||
|
||||
token = os.getenv("HF_HUB_READ_TOKEN", True)
|
||||
ds = load_dataset("mozilla-foundation/common_voice_6_1", "ja", split="test", streaming=True, token=token)
|
||||
ds = load_dataset(
|
||||
"mozilla-foundation/common_voice_6_1",
|
||||
"ja",
|
||||
split="test",
|
||||
streaming=True,
|
||||
token=token,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
|
||||
|
||||
input_speech = next(iter(ds))["audio"]["array"]
|
||||
@@ -2358,7 +2369,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
)
|
||||
assistant_model.to(torch_device)
|
||||
|
||||
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
dataset = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
||||
)
|
||||
sample = dataset[0]["audio"]
|
||||
|
||||
input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features
|
||||
@@ -2407,7 +2420,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
)
|
||||
assistant_model.to(torch_device)
|
||||
|
||||
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
dataset = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
||||
)
|
||||
sample = dataset[0]["audio"]
|
||||
|
||||
input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features
|
||||
@@ -2448,7 +2463,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
||||
model = model.to(torch_device)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
||||
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
||||
|
||||
input_features = processor(
|
||||
@@ -2484,7 +2499,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
prompt = "Mr. Kilter, Brionno." # let's force Quilter -> Kilter, Brion -> Brionno
|
||||
prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]")
|
||||
ds = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]", trust_remote_code=True
|
||||
)
|
||||
one_audio = np.concatenate([x["array"] for x in ds["audio"]], dtype=np.float32)
|
||||
|
||||
first_text = ds[0]["text"].lower()
|
||||
@@ -2535,7 +2552,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
||||
model = model.to(torch_device)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
||||
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
||||
|
||||
input_features = processor(
|
||||
@@ -2568,7 +2585,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
||||
model = model.to(torch_device)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
||||
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
||||
|
||||
input_features = processor(
|
||||
@@ -2610,7 +2627,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
||||
model = model.to(torch_device)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
||||
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
||||
audios = []
|
||||
audios.append(one_audio[110000:])
|
||||
@@ -2664,7 +2681,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
|
||||
model = model.to(torch_device)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
||||
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
||||
audios = []
|
||||
audios.append(one_audio[110000:])
|
||||
|
||||
Reference in New Issue
Block a user