[FEAT] Add Neftune into transformers Trainer (#27141)
* add v1 neftune * use `unwrap_model` instead * add test + docs * Apply suggestions from code review Co-authored-by: Zach Mueller <muellerzr@gmail.com> * more details * fixup * Update docs/source/en/main_classes/trainer.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * refactor a bit * more elaborated test * fix unwrap issue --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
@@ -740,3 +740,27 @@ Sections that were moved:
|
|||||||
| <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a>
|
| <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a>
|
||||||
| <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>
|
| <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>
|
||||||
]
|
]
|
||||||
|
|
||||||
|
## Boost your fine-tuning performances using NEFTune
|
||||||
|
|
||||||
|
|
||||||
|
NEFTune is a technique to boost the performance of chat models and was introduced by the paper “NEFTune: Noisy Embeddings Improve Instruction Finetuning” from Jain et al. It consists of adding noise to the embedding vectors during training. According to the abstract of the paper:
|
||||||
|
|
||||||
|
> Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune.
|
||||||
|
|
||||||
|
<div style="text-align: center">
|
||||||
|
<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/neft-screenshot.png">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
To use it in `Trainer`, simply pass `neftune_noise_alpha` when creating your `TrainingArguments` instance. Note that, to avoid any surprising behaviour, NEFTune is disabled after training to restore the original behaviour of the embedding layer.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import Trainer, TrainingArguments
|
||||||
|
|
||||||
|
args = TrainingArguments(..., neftune_noise_alpha=0.1)
|
||||||
|
trainer = Trainer(..., args=args)
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
trainer.train()
|
||||||
|
```
|
||||||
|
|||||||
@@ -113,6 +113,7 @@ from .trainer_utils import (
|
|||||||
find_executable_batch_size,
|
find_executable_batch_size,
|
||||||
get_last_checkpoint,
|
get_last_checkpoint,
|
||||||
has_length,
|
has_length,
|
||||||
|
neftune_post_forward_hook,
|
||||||
number_of_arguments,
|
number_of_arguments,
|
||||||
seed_worker,
|
seed_worker,
|
||||||
set_seed,
|
set_seed,
|
||||||
@@ -486,6 +487,8 @@ class Trainer:
|
|||||||
self.model_wrapped = model
|
self.model_wrapped = model
|
||||||
self.model = model
|
self.model = model
|
||||||
|
|
||||||
|
self.neftune_noise_alpha = args.neftune_noise_alpha
|
||||||
|
|
||||||
self.compute_metrics = compute_metrics
|
self.compute_metrics = compute_metrics
|
||||||
self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
|
self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
|
||||||
self.optimizer, self.lr_scheduler = optimizers
|
self.optimizer, self.lr_scheduler = optimizers
|
||||||
@@ -634,6 +637,42 @@ class Trainer:
|
|||||||
if args.torch_compile and not is_torch_compile_available():
|
if args.torch_compile and not is_torch_compile_available():
|
||||||
raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.")
|
raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.")
|
||||||
|
|
||||||
|
def _activate_neftune(self, model):
    r"""
    Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper:
    https://arxiv.org/abs/2310.05914

    The input embedding layer of `model` is tagged with `neftune_noise_alpha` and a forward hook
    (`neftune_post_forward_hook`) is registered on it. The hook handle is stored on
    `self.neftune_hook_handle` so that `_deactivate_neftune` can remove it later.

    Args:
        model:
            The (possibly wrapped) model whose input embeddings should receive the hook.

    Returns:
        The same `model` object that was passed in.
    """
    unwrapped_model = unwrap_model(model)

    # A PEFT wrapper holds the real network under `base_model`.
    if is_peft_available() and isinstance(unwrapped_model, PeftModel):
        embeddings = unwrapped_model.base_model.get_input_embeddings()
    else:
        embeddings = unwrapped_model.get_input_embeddings()

    del unwrapped_model

    # If a previous activation was never undone (for instance training raised before the
    # deactivation step), registering again would stack a second hook and overwrite the only
    # reference to the first one, which could then never be removed. Drop it first.
    stale_handle = getattr(self, "neftune_hook_handle", None)
    if stale_handle is not None:
        stale_handle.remove()

    embeddings.neftune_noise_alpha = self.neftune_noise_alpha
    self.neftune_hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)
    return model
|
||||||
|
|
||||||
|
def _deactivate_neftune(self, model):
    """
    Deactivates the neftune method. Make sure to call `_activate_neftune` first.

    Removes the forward hook stored in `self.neftune_hook_handle` and deletes the
    `neftune_noise_alpha` attribute from the input embedding layer of `model`.

    Args:
        model:
            The (possibly wrapped) model on which NEFTune was activated.

    Raises:
        ValueError: If no hook handle is stored on the trainer, i.e. NEFTune is not active.
    """
    if not hasattr(self, "neftune_hook_handle"):
        raise ValueError("Neftune is not activated make sure to call `trainer._activate_neftune()` first")

    unwrapped_model = unwrap_model(model)

    # A PEFT wrapper holds the real network under `base_model`.
    if is_peft_available() and isinstance(unwrapped_model, PeftModel):
        embeddings = unwrapped_model.base_model.get_input_embeddings()
    else:
        embeddings = unwrapped_model.get_input_embeddings()

    self.neftune_hook_handle.remove()
    # Forget the handle so that a second deactivation is reported by the guard above instead
    # of failing with an unrelated AttributeError on the attribute deletion below.
    del self.neftune_hook_handle

    del embeddings.neftune_noise_alpha, unwrapped_model
|
||||||
|
|
||||||
def add_callback(self, callback):
|
def add_callback(self, callback):
|
||||||
"""
|
"""
|
||||||
Add a callback to the current list of [`~transformer.TrainerCallback`].
|
Add a callback to the current list of [`~transformer.TrainerCallback`].
|
||||||
@@ -1444,6 +1483,10 @@ class Trainer:
|
|||||||
|
|
||||||
self.is_in_train = True
|
self.is_in_train = True
|
||||||
|
|
||||||
|
# Attach NEFTune hooks if necessary
|
||||||
|
if self.neftune_noise_alpha is not None:
|
||||||
|
self.model = self._activate_neftune(self.model)
|
||||||
|
|
||||||
# do_train is not a reliable argument, as it might not be set and .train() still called, so
|
# do_train is not a reliable argument, as it might not be set and .train() still called, so
|
||||||
# the following is a workaround:
|
# the following is a workaround:
|
||||||
if (args.fp16_full_eval or args.bf16_full_eval) and not args.do_train:
|
if (args.fp16_full_eval or args.bf16_full_eval) and not args.do_train:
|
||||||
@@ -1956,6 +1999,11 @@ class Trainer:
|
|||||||
# Wait for the checkpoint to be uploaded.
|
# Wait for the checkpoint to be uploaded.
|
||||||
self._finish_current_push()
|
self._finish_current_push()
|
||||||
|
|
||||||
|
# After training we make sure to retrieve back the original forward pass method
|
||||||
|
# for the embedding layer by removing the forward post hook.
|
||||||
|
if self.neftune_noise_alpha is not None:
|
||||||
|
self._deactivate_neftune(self.model)
|
||||||
|
|
||||||
return TrainOutput(self.state.global_step, train_loss, metrics)
|
return TrainOutput(self.state.global_step, train_loss, metrics)
|
||||||
|
|
||||||
def _get_output_dir(self, trial):
|
def _get_output_dir(self, trial):
|
||||||
|
|||||||
@@ -105,6 +105,32 @@ def set_seed(seed: int):
|
|||||||
tf.random.set_seed(seed)
|
tf.random.set_seed(seed)
|
||||||
|
|
||||||
|
|
||||||
|
def neftune_post_forward_hook(module, input, output):
    """
    Implements the NEFTune forward pass for the model using forward hooks. Note this works only for torch.nn.Embedding
    layers. This method is slightly adapted from the original source code that can be found here:
    https://github.com/neelsjain/NEFTune Simply add it to your model as follows:

    ```python
    model = ...
    model.embed_tokens.neftune_noise_alpha = 0.1
    model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
    ```

    Args:
        module (`torch.nn.Module`):
            The embedding module where the hook is attached. Note that you need to set `module.neftune_noise_alpha` to
            the desired noise alpha value.
        input (`Tuple[torch.Tensor]`):
            The positional inputs of the module, as passed by PyTorch to forward hooks. Unused.
        output (`torch.Tensor`):
            The output tensor of the module (i.e. the embeddings).

    Returns:
        `torch.Tensor`: `output` unchanged when the module is not in training mode; otherwise a new tensor equal to
        `output` plus uniform noise drawn from `[-alpha / sqrt(dims), alpha / sqrt(dims))`, where `dims` is the
        product of the last two dimensions of `output`.
    """
    if not module.training:
        return output

    # Product of the trailing (sequence, hidden) dimensions. For the usual 3-D output this is
    # exactly `output.size(1) * output.size(2)`; using the trailing dimensions also lets
    # unbatched 2-D outputs through instead of raising an IndexError.
    dims = 1
    for extent in output.shape[-2:]:
        dims *= extent

    mag_norm = module.neftune_noise_alpha / torch.sqrt(torch.tensor(dims))
    return output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
|
||||||
|
|
||||||
|
|
||||||
class EvalPrediction:
|
class EvalPrediction:
|
||||||
"""
|
"""
|
||||||
Evaluation output (always contains labels), to be used to compute metrics.
|
Evaluation output (always contains labels), to be used to compute metrics.
|
||||||
|
|||||||
@@ -627,6 +627,11 @@ class TrainingArguments:
|
|||||||
This will iterate over the entire training dataloader once beforehand,
|
This will iterate over the entire training dataloader once beforehand,
|
||||||
|
|
||||||
and will slow down the entire process.
|
and will slow down the entire process.
|
||||||
|
neftune_noise_alpha (`Optional[float]`):
|
||||||
|
If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance
|
||||||
|
for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the
|
||||||
|
[original code](https://github.com/neelsjain/NEFTune). Supports transformers `PreTrainedModel` and also
|
||||||
|
`PeftModel` from peft.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
framework = "pt"
|
framework = "pt"
|
||||||
@@ -1226,6 +1231,13 @@ class TrainingArguments:
|
|||||||
metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
|
metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Noise alpha for NEFTune; the default `None` leaves NEFTune disabled.
# NOTE(review): the annotation is `float` although the default is `None`; `Optional[float]`
# would describe it more accurately — confirm `Optional` is imported before changing it.
neftune_noise_alpha: float = field(
    default=None,
    metadata={
        "help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instruction fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
    },
)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
# expand paths, if not os.makedirs("~/bar") will make directory
|
# expand paths, if not os.makedirs("~/bar") will make directory
|
||||||
# in the current directory instead of the actual home
|
# in the current directory instead of the actual home
|
||||||
|
|||||||
@@ -838,6 +838,50 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
train_output = trainer.train()
|
train_output = trainer.train()
|
||||||
self.assertEqual(train_output.global_step, 10)
|
self.assertEqual(train_output.global_step, 10)
|
||||||
|
|
||||||
|
def test_neftune(self):
    """
    Checks that NEFTune noise is applied while the hook is active and that `Trainer.train()` removes the hook
    again once training has finished.
    """
    config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)

    def build_trainer(model, dataset):
        # Trainer without inf/nan filter
        training_args = TrainingArguments(
            "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
        )
        return Trainer(model, training_args, train_dataset=dataset)

    first_model = GPT2LMHeadModel(config)
    train_dataset = RepeatDataset(torch.randint(0, 100, (128,)))
    trainer = build_trainer(first_model, train_dataset)

    # Attach the hook by hand: two lookups of the same ids must now differ.
    trainer.model = trainer._activate_neftune(trainer.model)

    dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)

    first_pass = trainer.model.get_input_embeddings()(dummy_input)
    second_pass = trainer.model.get_input_embeddings()(dummy_input)
    self.assertFalse(torch.allclose(first_pass, second_pass), "Neftune noise is not applied!")

    # redefine the model
    trainer = build_trainer(GPT2LMHeadModel(config), train_dataset)

    # Check that it trains without errors
    trainer.train()

    # Make sure forward pass works fine
    _ = trainer.model(dummy_input)
    remaining_hooks = trainer.model.get_input_embeddings()._forward_hooks
    self.assertEqual(len(remaining_hooks), 0)

    trainer.model.eval()

    # Check that we get identical embeddings just in case
    first_pass = trainer.model.get_input_embeddings()(dummy_input)
    second_pass = trainer.model.get_input_embeddings()(dummy_input)
    self.assertTrue(torch.allclose(first_pass, second_pass), "Neftune noise is still applied!")
|
||||||
|
|
||||||
def test_logging_inf_nan_filter(self):
|
def test_logging_inf_nan_filter(self):
|
||||||
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
||||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||||
|
|||||||
Reference in New Issue
Block a user