Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826ca.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a3e.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
This commit is contained in:
Albert Villanova del Moral
2024-06-17 18:29:13 +02:00
committed by GitHub
parent 485fd81471
commit a14b055b65
168 changed files with 804 additions and 410 deletions

View File

@@ -69,7 +69,9 @@ class AudioClassificationPipelineTests(unittest.TestCase):
import datasets
# test with a local file
dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
dataset = datasets.load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
audio = dataset[0]["audio"]["array"]
output = audio_classifier(audio)
self.assertEqual(
@@ -115,7 +117,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
model = "superb/wav2vec2-base-superb-ks"
audio_classifier = pipeline("audio-classification", model=model)
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True)
audio = np.array(dataset[3]["speech"], dtype=np.float32)
output = audio_classifier(audio, top_k=4)

View File

@@ -206,7 +206,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@require_torch
@require_pyctcdecode
def test_large_model_pt_with_lm(self):
dataset = load_dataset("Narsil/asr_dummy", streaming=True)
dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True)
third_item = next(iter(dataset["test"].skip(3)))
filename = third_item["file"]
@@ -296,7 +296,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
output = speech_recognizer(waveform)
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@@ -313,7 +315,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
output = speech_recognizer(waveform)
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
@@ -328,7 +332,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=8,
stride_length_s=1,
)
data = load_dataset("librispeech_asr", "clean", split="test", streaming=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
sample = next(iter(data))
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe")
@@ -371,7 +375,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
task="automatic-speech-recognition",
model="openai/whisper-tiny.en",
)
data = load_dataset("librispeech_asr", "clean", split="test", streaming=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
samples = [next(iter(data)) for _ in range(8)]
audio = np.concatenate([sample["audio"]["array"] for sample in samples])
@@ -488,7 +492,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="openai/whisper-tiny",
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
@@ -663,7 +669,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
@require_torch
def test_whisper_timestamp_prediction(self):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
array = np.concatenate(
[ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]]
)
@@ -761,7 +769,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
@require_torch
def test_whisper_large_timestamp_prediction(self):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
array = np.concatenate(
[ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]]
)
@@ -855,7 +865,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=3,
return_timestamps="word",
)
data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
data = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
sample = data[0]["audio"]
# not the same output as test_simple_whisper_asr because of chunking
@@ -898,7 +910,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="openai/whisper-large-v3",
return_timestamps="word",
)
data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
data = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
sample = data[0]["audio"]
# not the same output as test_simple_whisper_asr because of chunking
@@ -943,7 +957,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'})
@@ -961,7 +977,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
output = asr(waveform)
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = asr(filename)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@@ -987,7 +1005,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
output = asr(waveform)
self.assertEqual(output, {"text": "(Applausi)"})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = asr(filename)
self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
@@ -1007,7 +1027,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="openai/whisper-tiny.en",
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
filename = ds[0]["file"]
output = speech_recognizer(filename)
self.assertEqual(
@@ -1076,7 +1098,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="openai/whisper-large",
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
@@ -1111,7 +1135,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="openai/whisper-tiny.en",
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
filename = ds[0]["file"]
# 1. English-only model compatible with no language argument
@@ -1144,7 +1170,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_non_distil(self):
# Load data:
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
sample = dataset[0]["audio"]
# Load model:
@@ -1188,7 +1216,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_distil(self):
# Load data:
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
sample = dataset[0]["audio"]
# Load model:
@@ -1240,7 +1270,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})
@@ -1256,7 +1288,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})
@@ -1273,7 +1307,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
filename = ds[40]["file"]
output = speech_recognizer(filename)
@@ -1290,7 +1326,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
framework="pt",
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
sample = dataset[0]["audio"]
output = speech_recognizer(sample)
@@ -1307,7 +1345,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=10.0,
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
n_repeats = 2
@@ -1323,7 +1363,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="hf-internal-testing/tiny-random-wav2vec2",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
# Take short audio to keep the test readable
audio = ds[40]["audio"]["array"][:800]
@@ -1367,7 +1409,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=10.0,
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
n_repeats = 2
@@ -1395,7 +1439,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
)
self.assertEqual(speech_recognizer.type, "ctc_with_lm")
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
n_repeats = 2
@@ -1423,7 +1469,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
)
self.assertEqual(speech_recognizer.type, "ctc_with_lm")
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
n_repeats = 2
@@ -1507,7 +1555,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
device=torch_device,
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
sample = dataset[0]["audio"]
result = pipe(sample, generate_kwargs={"tgt_lang": "eng"})
@@ -1530,7 +1580,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=10.0,
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
n_repeats = 10
@@ -1642,7 +1694,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
model="patrickvonplaten/wav2vec2-base-100h-with-lm",
chunk_length_s=10.0,
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
n_repeats = 10

View File

@@ -840,7 +840,9 @@ class CustomPipelineTest(unittest.TestCase):
def test_chunk_pipeline_batching_single_file(self):
# Make sure we have cached the pipeline.
pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC")
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
ds = datasets.load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
).sort("id")
audio = ds[40]["audio"]["array"]
pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC")

View File

@@ -567,7 +567,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"]
outputs = image_segmenter(file, threshold=threshold)
@@ -621,7 +621,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
def test_oneformer(self):
image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"]
outputs = image_segmenter(file, threshold=0.99)
# Shortening by hashing