[AutoProcessor] Correct AutoProcessor and automatically add processor… (#14881)

* [AutoProcessor] Correct AutoProcessor and automatically add processor class

* up

* up

* up

* up

* up

* up

* up

* up

* continue tomorrow

* up

* up

* up

* make processor class private

* fix loop
This commit is contained in:
Patrick von Platen
2021-12-30 09:56:43 +01:00
committed by GitHub
parent d7d60df0ec
commit a1392883ce
11 changed files with 110 additions and 7 deletions

View File

@@ -13,13 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import tempfile
import unittest
from shutil import copyfile
from transformers import AutoProcessor, Wav2Vec2Config, Wav2Vec2Processor
from transformers import AutoProcessor, AutoTokenizer, Wav2Vec2Config, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from transformers.file_utils import FEATURE_EXTRACTOR_NAME
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
SAMPLE_PROCESSOR_CONFIG = os.path.join(
@@ -55,3 +57,61 @@ class AutoFeatureExtractorTest(unittest.TestCase):
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_feat_extr_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in tokenizer
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f:
f.write(json.dumps(config_dict))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_tokenizer_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in feature extractor
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "w") as f:
f.write(json.dumps(config_dict))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_directory_from_model_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = Wav2Vec2Config(processor_class="Wav2Vec2Processor")
model_config.save_pretrained(tmpdirname)
# copy relevant files
copyfile(SAMPLE_VOCAB, os.path.join(tmpdirname, "vocab.json"))
# create emtpy sample processor
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "w") as f:
f.write("{}")
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)