import torch
import torch.optim as optim

# 1. Add the checkpoint argument and load the pretrained weights
if args.init:  # load from a previous checkpoint
    print(f"init {args.init} to device:{args.local_rank}")
    init = torch.load(args.init, map_location=f"cuda:{args.local_rank}")
    init_state_dict = init["state_dict"]
    net_state_dict = net.state_dict()
    # Keep only the checkpoint tensors whose names and shapes match the current model
    pretrained_dict = {k: v for k, v in init_state_dict.items()
                       if k in net_state_dict and net_state_dict[k].size() == v.size()}
    # Merge the matching weights into the model's state_dict and load it
    net_state_dict.update(pretrained_dict)
    net.load_state_dict(net_state_dict)
    # optimizer, init_lr_sch = configure_pretrained_optimizers(net, init, args)
    optimizer = configure_pretrained_optimizers(net, init, args)
    lr_scheduler = None  # no scheduler until the learning rates are equalized (step 3)
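
# (Sketch, not from the original post: step 1 reads args.init, args.local_rank,
#  args.learning_rate and args.equalize_epoch, so the argument parser needs entries
#  roughly like the following; the exact flag names and defaults are assumptions.)
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--init", type=str, default=None,
                        help="path to a pretrained checkpoint to initialize from")
    parser.add_argument("--local-rank", type=int, default=0,
                        help="GPU index used as the map_location for the checkpoint")
    parser.add_argument("--learning-rate", type=float, default=1e-4,
                        help="base learning rate for the newly added layers")
    parser.add_argument("--equalize-epoch", type=int, default=50,
                        help="epoch at which both parameter groups switch to the same LR")
    return parser.parse_args()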
# 2. Build the optimizer: pretrained layers and newly added layers get separate learning rates
def configure_pretrained_optimizers(net, init, args):
    init_params = []      # parameters that were loaded from the checkpoint
    net_only_params = []  # parameters that exist only in the new model
    for name, param in net.named_parameters():
        if not name.endswith(".quantiles") and param.requires_grad:
            if name in init["state_dict"]:
                print(f"Pretrained Layer: {name}")
                init_params.append(param)
            else:
                print(f"New Layer: {name}")
                net_only_params.append(param)
    init_lr = args.learning_rate * 0.0001  # much smaller LR for the pretrained layers
    net_lr = args.learning_rate            # full LR for the newly added layers
    print(f"pretrained layer lr:{init_lr}, added layer lr:{net_lr}")
    param_groups = [
        {"params": init_params, "lr": init_lr},
        {"params": net_only_params, "lr": net_lr},
    ]
    optimizer = optim.Adam(param_groups)
    return optimizer  # , get_init_sch(optimizer, args)
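
# (Sketch, not from the original post: a quick self-contained check of the grouping above,
#  using a toy two-layer model and a fake checkpoint that contains only the first layer;
#  argparse.Namespace stands in for the parsed args.)
import torch.nn as nn
from argparse import Namespace

toy = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))   # pretend only layer "0" is pretrained
fake_ckpt = {"state_dict": {k: v for k, v in toy.state_dict().items() if k.startswith("0.")}}
toy_opt = configure_pretrained_optimizers(toy, fake_ckpt, Namespace(learning_rate=1e-4))
print([g["lr"] for g in toy_opt.param_groups])           # -> [1e-08, 0.0001]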
# 3. At a given epoch, set both groups to the same learning rate and start the lr_scheduler
if args.init:
    if epoch == args.equalize_epoch:
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.learning_rate * 0.1
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=milestones,
            gamma=0.5,
            last_epoch=-1,
        )
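
# (Sketch, not from the original post: how the pieces above can sit in the epoch loop.
#  start_epoch, args.epochs, train_one_epoch and test_epoch are placeholder names; the
#  point is that lr_scheduler stays None, and is therefore never stepped, until the
#  step-3 block has created it at args.equalize_epoch.)
for epoch in range(start_epoch, args.epochs):
    # ... step 3 from above runs here, creating lr_scheduler at args.equalize_epoch ...
    train_one_epoch(net, train_dataloader, optimizer, epoch)  # optimizer.step() happens inside
    test_epoch(net, test_dataloader, epoch)
    if lr_scheduler is not None:
        lr_scheduler.step()  # only active once the learning rates have been equalized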