diff --git a/docs/source/en/main_classes/trainer.md b/docs/source/en/main_classes/trainer.md
index 462cea55dc..e9a93bbff7 100644
--- a/docs/source/en/main_classes/trainer.md
+++ b/docs/source/en/main_classes/trainer.md
@@ -740,3 +740,27 @@ Sections that were moved:
| Gradient Clipping
| Getting The Model Weights Out
]
+
+## Boost your fine-tuning performances using NEFTune
+
+
+NEFTune is a technique to boost the performance of chat models, introduced in the paper “NEFTune: Noisy Embeddings Improve Instruction Finetuning” by Jain et al. It consists of adding noise to the embedding vectors during training. According to the abstract of the paper:
+
+> Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune.
+
+
+

+
+
+To use it in `Trainer`, simply pass `neftune_noise_alpha` when creating your `TrainingArguments` instance. Note that, to avoid any surprising behaviour, NEFTune is disabled after training so that the embedding layer returns to its original behaviour.
+
+```python
+from transformers import Trainer, TrainingArguments
+
+args = TrainingArguments(..., neftune_noise_alpha=0.1)
+trainer = Trainer(..., args=args)
+
+...
+
+trainer.train()
+```
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 945b557021..aa5e372bdc 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -113,6 +113,7 @@ from .trainer_utils import (
find_executable_batch_size,
get_last_checkpoint,
has_length,
+ neftune_post_forward_hook,
number_of_arguments,
seed_worker,
set_seed,
@@ -486,6 +487,8 @@ class Trainer:
self.model_wrapped = model
self.model = model
+ self.neftune_noise_alpha = args.neftune_noise_alpha
+
self.compute_metrics = compute_metrics
self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
self.optimizer, self.lr_scheduler = optimizers
@@ -634,6 +637,42 @@ class Trainer:
if args.torch_compile and not is_torch_compile_available():
raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.")
+    def _activate_neftune(self, model):
+        r"""
+        Activates NEFTune on `model` by registering a noise-injecting forward hook on its input embedding layer.
+
+        The implementation follows the reference code at https://github.com/neelsjain/NEFTune and the paper
+        https://arxiv.org/abs/2310.05914.
+
+        Args:
+            model:
+                The (possibly wrapped) model to train. For a `PeftModel`, the input embeddings of its `base_model`
+                are used.
+
+        Returns:
+            The same `model` object that was passed in; only its input embedding layer is modified in place.
+        """
+        # Drop a hook left over from an earlier activation that was never deactivated (for example a training
+        # run that raised before reaching `_deactivate_neftune`); otherwise a second hook would be stacked on
+        # top of it and the noise would be added twice.
+        stale_handle = getattr(self, "neftune_hook_handle", None)
+        if stale_handle is not None:
+            stale_handle.remove()
+
+        unwrapped_model = unwrap_model(model)
+
+        if is_peft_available() and isinstance(unwrapped_model, PeftModel):
+            embeddings = unwrapped_model.base_model.get_input_embeddings()
+        else:
+            embeddings = unwrapped_model.get_input_embeddings()
+
+        # The hook reads the noise scale from this attribute on the embedding module.
+        embeddings.neftune_noise_alpha = self.neftune_noise_alpha
+        # Keep the handle so that the hook can be removed again later.
+        self.neftune_hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)
+        return model
+
+    def _deactivate_neftune(self, model):
+        """
+        Deactivates NEFTune by removing the forward hook registered by `_activate_neftune`.
+
+        Args:
+            model:
+                The (possibly wrapped) model whose input embedding layer carries the NEFTune hook. For a
+                `PeftModel`, the input embeddings of its `base_model` are used.
+
+        Raises:
+            ValueError: If no NEFTune hook handle is stored on this trainer, i.e. NEFTune is not active.
+        """
+        if not hasattr(self, "neftune_hook_handle"):
+            raise ValueError("Neftune is not activated make sure to call `trainer._activate_neftune()` first")
+
+        unwrapped_model = unwrap_model(model)
+
+        if is_peft_available() and isinstance(unwrapped_model, PeftModel):
+            embeddings = unwrapped_model.base_model.get_input_embeddings()
+        else:
+            embeddings = unwrapped_model.get_input_embeddings()
+
+        self.neftune_hook_handle.remove()
+        # Forget the handle so the guard at the top stays accurate: a second call now raises the explicit
+        # `ValueError` instead of failing on the attribute deletion below.
+        del self.neftune_hook_handle
+        del embeddings.neftune_noise_alpha
+
def add_callback(self, callback):
"""
Add a callback to the current list of [`~transformer.TrainerCallback`].
@@ -1444,6 +1483,10 @@ class Trainer:
self.is_in_train = True
+ # Attach NEFTune hooks if necessary
+ if self.neftune_noise_alpha is not None:
+ self.model = self._activate_neftune(self.model)
+
# do_train is not a reliable argument, as it might not be set and .train() still called, so
# the following is a workaround:
if (args.fp16_full_eval or args.bf16_full_eval) and not args.do_train:
@@ -1956,6 +1999,11 @@ class Trainer:
# Wait for the checkpoint to be uploaded.
self._finish_current_push()
+ # After training we make sure to retrieve back the original forward pass method
+ # for the embedding layer by removing the forward post hook.
+ if self.neftune_noise_alpha is not None:
+ self._deactivate_neftune(self.model)
+
return TrainOutput(self.state.global_step, train_loss, metrics)
def _get_output_dir(self, trial):
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index 5bf29efffa..dd793c0203 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -105,6 +105,32 @@ def set_seed(seed: int):
tf.random.set_seed(seed)
+def neftune_post_forward_hook(module, input, output):
+    """
+    Forward hook that adds NEFTune noise to the output of an embedding layer while it is in training mode.
+
+    The noise is drawn uniformly from `[-b, b]` with `b = module.neftune_noise_alpha / sqrt(output.size(1) *
+    output.size(2))`. Outside of training mode the output is returned untouched. This is meant for
+    `torch.nn.Embedding` layers only and is slightly adapted from the original implementation at
+    https://github.com/neelsjain/NEFTune. Typical usage:
+
+    ```python
+    model = ...
+    model.embed_tokens.neftune_noise_alpha = 0.1
+    model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
+    ```
+
+    Args:
+        module (`torch.nn.Module`):
+            The embedding module the hook is attached to. Its `neftune_noise_alpha` attribute must be set to the
+            desired noise alpha before the hook runs.
+        input (`torch.Tensor`):
+            The input passed to the module (unused by the hook).
+        output (`torch.Tensor`):
+            The output of the module, i.e. the embeddings. Dimensions 1 and 2 are read, so it must have at least
+            three dimensions.
+
+    Returns:
+        `torch.Tensor`: The noisy embeddings in training mode, otherwise `output` itself.
+    """
+    if not module.training:
+        return output
+
+    # Scale the noise bound by the square root of the product of dimensions 1 and 2 of the output.
+    scale_dims = torch.tensor(output.size(1) * output.size(2))
+    noise_bound = module.neftune_noise_alpha / torch.sqrt(scale_dims)
+    noise = torch.zeros_like(output).uniform_(-noise_bound, noise_bound)
+    return output + noise
+
+
class EvalPrediction:
"""
Evaluation output (always contains labels), to be used to compute metrics.
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 507515c696..8a6d7255f5 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -627,6 +627,11 @@ class TrainingArguments:
This will iterate over the entire training dataloader once beforehand,
and will slow down the entire process.
+ neftune_noise_alpha (`Optional[float]`):
+ If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance
+ for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the
+ [original code](https://github.com/neelsjain/NEFTune). Supported for transformers `PreTrainedModel` and
+ `PeftModel` from peft.
"""
framework = "pt"
@@ -1226,6 +1231,13 @@ class TrainingArguments:
metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
)
+    # Noise alpha for NEFTune. `None` (the default) leaves NEFTune disabled; the annotation is `Optional`
+    # because the default is not a float.
+    neftune_noise_alpha: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instruction fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
+        },
+    )
+
def __post_init__(self):
# expand paths, if not os.makedirs("~/bar") will make directory
# in the current directory instead of the actual home
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 8791e92c71..6c208d0de0 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -838,6 +838,50 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
train_output = trainer.train()
self.assertEqual(train_output.global_step, 10)
+    def test_neftune(self):
+        """
+        Checks that NEFTune perturbs the input embeddings once activated, and that `Trainer.train()` removes the
+        hook again so the embedding layer is deterministic afterwards.
+        """
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
+        tiny_gpt2 = GPT2LMHeadModel(config)
+        x = torch.randint(0, 100, (128,))
+        train_dataset = RepeatDataset(x)
+
+        # Write trainer artefacts to a self-cleaning temporary directory rather than `./test` in the current
+        # working directory.
+        output_dir = self.get_auto_remove_tmp_dir()
+
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            output_dir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+
+        trainer.model = trainer._activate_neftune(trainer.model)
+
+        dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)
+
+        # With the hook attached, two passes over the same ids must produce different embeddings.
+        emb1 = trainer.model.get_input_embeddings()(dummy_input)
+        emb2 = trainer.model.get_input_embeddings()(dummy_input)
+
+        self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")
+
+        # redefine the model
+        tiny_gpt2 = GPT2LMHeadModel(config)
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            output_dir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+
+        # Check that it trains without errors
+        trainer.train()
+
+        # Make sure forward pass works fine
+        _ = trainer.model(dummy_input)
+        self.assertEqual(len(trainer.model.get_input_embeddings()._forward_hooks), 0)
+
+        # The NEFTune hook only adds noise in training mode, so compare in training mode: an eval-mode
+        # comparison would pass even if the hook were still attached.
+        trainer.model.train()
+
+        # Check that we get identical embeddings just in case
+        emb1 = trainer.model.get_input_embeddings()(dummy_input)
+        emb2 = trainer.model.get_input_embeddings()(dummy_input)
+
+        self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!")
+
def test_logging_inf_nan_filter(self):
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
tiny_gpt2 = GPT2LMHeadModel(config)