Trainer - deprecate tokenizer for processing_class (#32385)

* Trainer - deprecate tokenizer for processing_class

* Extend chage across Seq2Seq trainer and docs

* Add tests

* Update to FutureWarning and add deprecation version
This commit is contained in:
amyeroberts
2024-10-02 14:08:46 +01:00
committed by GitHub
parent e7c8af7f33
commit b7474f211d
99 changed files with 569 additions and 442 deletions

View File

@@ -38,6 +38,9 @@ from parameterized import parameterized
from requests.exceptions import HTTPError
from transformers import (
AutoFeatureExtractor,
AutoImageProcessor,
AutoProcessor,
AutoTokenizer,
IntervalStrategy,
PretrainedConfig,
@@ -1059,14 +1062,14 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
max_steps=10,
use_cpu=True,
)
trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()
parameters = dict(tiny_model.named_parameters())
state = dataclasses.asdict(trainer.state)
# Reinitialize trainer
trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
checkpoint = os.path.join(tmpdir, "checkpoint-5")
@@ -3786,6 +3789,98 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
_ = trainer.evaluate()
_ = trainer.predict(eval_dataset)
def test_trainer_saves_tokenizer(self):
MODEL_ID = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
with tempfile.TemporaryDirectory() as tmp_dir:
config = RegressionModelConfig(a=1.5, b=2.5)
trainer = Trainer(
model=RegressionPreTrainedModel(config),
args=TrainingArguments(output_dir=tmp_dir),
processing_class=tokenizer,
)
trainer.save_model()
reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
# For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g.
# saved tokens change overtime, so we check that two tokenizers are equal by comparing their encoded outputs
test_sentence = "This is a test sentence"
self.assertListEqual(
tokenizer(test_sentence, padding="max_length").input_ids,
reloaded_tokenizer(test_sentence, padding="max_length").input_ids,
)
def test_trainer_saves_image_processor(self):
MODEL_ID = "openai/clip-vit-base-patch32"
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
with tempfile.TemporaryDirectory() as tmp_dir:
config = RegressionModelConfig(a=1.5, b=2.5)
trainer = Trainer(
model=RegressionPreTrainedModel(config),
args=TrainingArguments(output_dir=tmp_dir),
processing_class=image_processor,
)
trainer.save_model()
reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
self.assertDictEqual(image_processor.to_dict(), reloaded_image_processor.to_dict())
def test_trainer_saves_feature_extractor(self):
MODEL_ID = "facebook/wav2vec2-base-960h"
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
with tempfile.TemporaryDirectory() as tmp_dir:
config = RegressionModelConfig(a=1.5, b=2.5)
trainer = Trainer(
model=RegressionPreTrainedModel(config),
args=TrainingArguments(output_dir=tmp_dir),
processing_class=feature_extractor,
)
trainer.save_model()
reloaded_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir)
self.assertDictEqual(feature_extractor.to_dict(), reloaded_feature_extractor.to_dict())
def test_trainer_saves_processor(self):
MODEL_ID = "openai/clip-vit-base-patch32"
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
processor = AutoProcessor.from_pretrained(MODEL_ID)
with tempfile.TemporaryDirectory() as tmp_dir:
config = RegressionModelConfig(a=1.5, b=2.5)
trainer = Trainer(
model=RegressionPreTrainedModel(config),
args=TrainingArguments(output_dir=tmp_dir),
processing_class=processor,
)
trainer.save_model()
reloaded_processor = AutoProcessor.from_pretrained(tmp_dir)
reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
self.assertDictEqual(reloaded_processor.to_dict(), processor.to_dict())
image_processor_dict = image_processor.to_dict()
reloaded_image_processor_dict = reloaded_image_processor.to_dict()
# When the processor is saved in the trainer, the _processor_class gets set in the reload_image_processor dict
image_processor_dict.pop("_processor_class")
reloaded_image_processor_dict.pop("_processor_class")
self.assertDictEqual(image_processor_dict, reloaded_image_processor_dict)
# For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g.
# saved tokens change overtime, so we check that two tokenizers are equal by comparing their encoded outputs
test_sentence = "This is a test sentence"
self.assertListEqual(
tokenizer(test_sentence, padding="max_length").input_ids,
reloaded_tokenizer(test_sentence, padding="max_length").input_ids,
)
@require_torch
@is_staging_test