Trainer - deprecate tokenizer for processing_class (#32385)
* Trainer - deprecate tokenizer for processing_class * Extend chage across Seq2Seq trainer and docs * Add tests * Update to FutureWarning and add deprecation version
This commit is contained in:
@@ -38,6 +38,9 @@ from parameterized import parameterized
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from transformers import (
|
||||
AutoFeatureExtractor,
|
||||
AutoImageProcessor,
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
IntervalStrategy,
|
||||
PretrainedConfig,
|
||||
@@ -1059,14 +1062,14 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
max_steps=10,
|
||||
use_cpu=True,
|
||||
)
|
||||
trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset)
|
||||
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
|
||||
trainer.train()
|
||||
parameters = dict(tiny_model.named_parameters())
|
||||
state = dataclasses.asdict(trainer.state)
|
||||
|
||||
# Reinitialize trainer
|
||||
trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset)
|
||||
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
|
||||
checkpoint = os.path.join(tmpdir, "checkpoint-5")
|
||||
|
||||
@@ -3786,6 +3789,98 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
_ = trainer.evaluate()
|
||||
_ = trainer.predict(eval_dataset)
|
||||
|
||||
def test_trainer_saves_tokenizer(self):
|
||||
MODEL_ID = "google-bert/bert-base-uncased"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
config = RegressionModelConfig(a=1.5, b=2.5)
|
||||
trainer = Trainer(
|
||||
model=RegressionPreTrainedModel(config),
|
||||
args=TrainingArguments(output_dir=tmp_dir),
|
||||
processing_class=tokenizer,
|
||||
)
|
||||
trainer.save_model()
|
||||
|
||||
reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
|
||||
|
||||
# For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g.
|
||||
# saved tokens change overtime, so we check that two tokenizers are equal by comparing their encoded outputs
|
||||
test_sentence = "This is a test sentence"
|
||||
self.assertListEqual(
|
||||
tokenizer(test_sentence, padding="max_length").input_ids,
|
||||
reloaded_tokenizer(test_sentence, padding="max_length").input_ids,
|
||||
)
|
||||
|
||||
def test_trainer_saves_image_processor(self):
|
||||
MODEL_ID = "openai/clip-vit-base-patch32"
|
||||
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
config = RegressionModelConfig(a=1.5, b=2.5)
|
||||
trainer = Trainer(
|
||||
model=RegressionPreTrainedModel(config),
|
||||
args=TrainingArguments(output_dir=tmp_dir),
|
||||
processing_class=image_processor,
|
||||
)
|
||||
trainer.save_model()
|
||||
reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
|
||||
|
||||
self.assertDictEqual(image_processor.to_dict(), reloaded_image_processor.to_dict())
|
||||
|
||||
def test_trainer_saves_feature_extractor(self):
|
||||
MODEL_ID = "facebook/wav2vec2-base-960h"
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
config = RegressionModelConfig(a=1.5, b=2.5)
|
||||
trainer = Trainer(
|
||||
model=RegressionPreTrainedModel(config),
|
||||
args=TrainingArguments(output_dir=tmp_dir),
|
||||
processing_class=feature_extractor,
|
||||
)
|
||||
trainer.save_model()
|
||||
|
||||
reloaded_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir)
|
||||
|
||||
self.assertDictEqual(feature_extractor.to_dict(), reloaded_feature_extractor.to_dict())
|
||||
|
||||
def test_trainer_saves_processor(self):
|
||||
MODEL_ID = "openai/clip-vit-base-patch32"
|
||||
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
|
||||
processor = AutoProcessor.from_pretrained(MODEL_ID)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
config = RegressionModelConfig(a=1.5, b=2.5)
|
||||
trainer = Trainer(
|
||||
model=RegressionPreTrainedModel(config),
|
||||
args=TrainingArguments(output_dir=tmp_dir),
|
||||
processing_class=processor,
|
||||
)
|
||||
trainer.save_model()
|
||||
|
||||
reloaded_processor = AutoProcessor.from_pretrained(tmp_dir)
|
||||
reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
|
||||
reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
|
||||
|
||||
self.assertDictEqual(reloaded_processor.to_dict(), processor.to_dict())
|
||||
|
||||
image_processor_dict = image_processor.to_dict()
|
||||
reloaded_image_processor_dict = reloaded_image_processor.to_dict()
|
||||
# When the processor is saved in the trainer, the _processor_class gets set in the reload_image_processor dict
|
||||
image_processor_dict.pop("_processor_class")
|
||||
reloaded_image_processor_dict.pop("_processor_class")
|
||||
self.assertDictEqual(image_processor_dict, reloaded_image_processor_dict)
|
||||
|
||||
# For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g.
|
||||
# saved tokens change overtime, so we check that two tokenizers are equal by comparing their encoded outputs
|
||||
test_sentence = "This is a test sentence"
|
||||
self.assertListEqual(
|
||||
tokenizer(test_sentence, padding="max_length").input_ids,
|
||||
reloaded_tokenizer(test_sentence, padding="max_length").input_ids,
|
||||
)
|
||||
|
||||
|
||||
@require_torch
|
||||
@is_staging_test
|
||||
|
||||
@@ -129,7 +129,7 @@ class Seq2seqTrainerTester(TestCasePlus):
|
||||
compute_metrics=_compute_metrics,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=val_dataset,
|
||||
tokenizer=tokenizer,
|
||||
processing_class=tokenizer,
|
||||
)
|
||||
|
||||
# start training
|
||||
@@ -158,7 +158,7 @@ class Seq2seqTrainerTester(TestCasePlus):
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
tokenizer=tokenizer,
|
||||
processing_class=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=lambda x: {"samples": x[0].shape[0]},
|
||||
)
|
||||
@@ -199,7 +199,7 @@ class Seq2seqTrainerTester(TestCasePlus):
|
||||
_ = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
tokenizer=tokenizer,
|
||||
processing_class=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=lambda x: {"samples": x[0].shape[0]},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user