@@ -80,7 +80,6 @@ from .trainer_pt_utils import (
|
||||
SequentialDistributedSampler,
|
||||
distributed_broadcast_scalars,
|
||||
distributed_concat,
|
||||
get_parameter_names,
|
||||
nested_concat,
|
||||
nested_detach,
|
||||
nested_numpify,
|
||||
@@ -614,15 +613,14 @@ class Trainer:
|
||||
Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass.
|
||||
"""
|
||||
if self.optimizer is None:
|
||||
decay_parameters = get_parameter_names(self.model, [torch.nn.LayerNorm])
|
||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in self.model.named_parameters() if n in decay_parameters],
|
||||
"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": self.args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in self.model.named_parameters() if n not in decay_parameters],
|
||||
"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
|
||||
@@ -672,19 +672,3 @@ def save_state(self):
|
||||
|
||||
path = os.path.join(self.args.output_dir, "trainer_state.json")
|
||||
self.state.save_to_json(path)
|
||||
|
||||
|
||||
def get_parameter_names(model, forbidden_layer_types):
    """
    Returns the names of the model parameters that are not inside a forbidden layer.

    Args:
        model: The module to inspect. It must expose ``named_children()`` and a ``_parameters`` mapping.
        forbidden_layer_types: An iterable of layer classes. Any child that is an instance of one of them is
            skipped together with everything nested inside it.

    Returns:
        A list of dotted parameter names relative to ``model``: the names found in the (non-forbidden) children
        first, in ``named_children()`` order, followed by the parameters held directly by ``model``.
    """
    # Build the tuple once; `isinstance` needs a tuple and the set of types does not change while recursing.
    forbidden = tuple(forbidden_layer_types)
    result = []
    for name, child in model.named_children():
        # Skip a forbidden child before recursing: all of its names would be dropped anyway, so walking its
        # subtree is wasted work.
        if isinstance(child, forbidden):
            continue
        result += [f"{name}.{n}" for n in get_parameter_names(child, forbidden)]
    # Add model specific parameters (defined with nn.Parameter) since they are not in any child.
    result += list(model._parameters.keys())
    return result
|
||||
|
||||
Reference in New Issue
Block a user