For data parallelism, upload the training results only once

commit 09afaecfe8
parent 1fa0edbf01

pretrain.py (10 changes)
@@ -115,7 +115,7 @@ if __name__ == "__main__":
     net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
     time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

-    ### If a model file was selected, use pretrain_dir; note that the ckpt_url method is still kept, so you can still use ckpt_url, but that method will be deprecated over time
+    ### If a model file was selected, use the pretrain_url method; note that the ckpt_url method is still kept, so you can still use ckpt_url, but that method will be deprecated over time
     load_param_into_net(network, load_checkpoint(os.path.join(pretrain_dir, "checkpoint_lenet-1_1875.ckpt")))

     if args.device_target != "Ascend":
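For context, the load_param_into_net call above is the standard MindSpore pattern for warm-starting a network from a checkpoint. A minimal sketch, assuming a hypothetical stand-in network and a local pretrain_dir; the checkpoint filename is the one from the diff:

import os

import mindspore.nn as nn
from mindspore import load_checkpoint, load_param_into_net

# Hypothetical stand-in for the LeNet built in the script.
class TinyNet(nn.Cell):
    def __init__(self):
        super().__init__()
        self.fc = nn.Dense(10, 2)

    def construct(self, x):
        return self.fc(x)

network = TinyNet()
# Assumption: pretrain_dir is the directory the platform mounts the
# selected model file into.
pretrain_dir = "./pretrain"
param_dict = load_checkpoint(os.path.join(pretrain_dir, "checkpoint_lenet-1_1875.ckpt"))
load_param_into_net(network, param_dict)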
@@ -133,12 +133,8 @@ if __name__ == "__main__":
     config_ck = CheckpointConfig(
         save_checkpoint_steps=cfg.save_checkpoint_steps,
         keep_checkpoint_max=cfg.keep_checkpoint_max)
-    # Note that this method saves the model file on each card. You need to specify the save path on each card.
-    # In this example, get_rank() is added to distinguish different paths.
-    if device_num == 1:
-        outputDirectory = train_dir + "/"
-    if device_num > 1:
-        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    # For data parallelism, the model file only needs to be uploaded once
+    outputDirectory = train_dir + "/"
     ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                  directory=outputDirectory,
                                  config=config_ck)
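The hunk above swaps between two output-directory schemes. A sketch of both, assuming a MindSpore distributed job where get_rank() and get_group_size() are valid after init(), and where train_dir is the directory the platform uploads when the task ends:

from mindspore.communication import init, get_rank, get_group_size

init()                          # requires a launched distributed job (e.g. via mpirun)
device_num = get_group_size()   # number of cards in the job
train_dir = "./output"          # assumption: the platform's auto-uploaded directory

# Old scheme from the diff: one subdirectory per card, so N cards upload N copies.
if device_num > 1:
    outputDirectory = train_dir + "/" + str(get_rank()) + "/"
else:
    outputDirectory = train_dir + "/"

# New scheme from the diff: with data parallelism every card holds identical
# weights, so one shared directory (hence one uploaded copy) is enough.
outputDirectory = train_dir + "/"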
@@ -112,12 +112,9 @@ if __name__ == "__main__":
     model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
     config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                  keep_checkpoint_max=cfg.keep_checkpoint_max)
-    # Note that this method saves the model file on each card. You need to specify the save path on each card.
-    # In this example, get_rank() is added to distinguish different paths.
-    if device_num == 1:
-        outputDirectory = train_dir
-    if device_num > 1:
-        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    print("============== Starting Training ==============")
+    # For data parallelism, the output model file only needs to be uploaded once
+    outputDirectory = train_dir
     ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                  directory=outputDirectory,
                                  config=config_ck)
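As background for the checkpoint lines these hunks keep touching, a minimal sketch of how CheckpointConfig and ModelCheckpoint fit together; the numeric values are placeholders for the cfg fields in the script:

from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

# Save every 1875 steps and keep at most 10 checkpoint files on disk;
# files beyond the limit are removed, oldest first.
config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                             keep_checkpoint_max=10)
# Checkpoints are written as <prefix>-<epoch>_<step>.ckpt inside 'directory',
# e.g. checkpoint_lenet-1_1875.ckpt, the same name pretrain.py loads above.
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                             directory="./output",
                             config=config_ck)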
@@ -126,11 +123,6 @@ if __name__ == "__main__":
     if (args.epoch_size):
         epoch_size = args.epoch_size
         print('epoch_size is: ', epoch_size)
     # set callback functions
     callback =[time_cb,LossMonitor()]
-    local_rank=int(os.getenv('RANK_ID'))
-    # for data parallel, only save checkpoint on rank 0
-    if local_rank==0 :
-        callback.append(ckpoint_cb)
     model.train(epoch_size,ds_train,callbacks=callback)
-    ### The Zhisuan platform does not need the training results to be sent back; they are sent back automatically after the task ends
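The removed lines above are the usual rank-gating idiom, kept here as a sketch for comparison; it assumes time_cb, ckpoint_cb, model, ds_train and epoch_size are defined as in the script, and that RANK_ID is the per-card environment variable set by the launch tooling:

import os
from mindspore.train.callback import LossMonitor

callback = [time_cb, LossMonitor()]
# RANK_ID identifies the card; default to 0 so single-card runs still work.
local_rank = int(os.getenv('RANK_ID', '0'))
if local_rank == 0:
    # only card 0 registers the checkpoint callback, so only one copy is written
    callback.append(ckpoint_cb)
model.train(epoch_size, ds_train, callbacks=callback)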
train.py (8 changes)
@@ -115,12 +115,8 @@ if __name__ == "__main__":
     config_ck = CheckpointConfig(
         save_checkpoint_steps=cfg.save_checkpoint_steps,
         keep_checkpoint_max=cfg.keep_checkpoint_max)
-    # Note that this method saves the model file on each card. You need to specify the save path on each card.
-    # In this example, get_rank() is added to distinguish different paths.
-    if device_num == 1:
-        outputDirectory = train_dir + "/"
-    if device_num > 1:
-        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    # For data parallelism, the model file only needs to be uploaded once
+    outputDirectory = train_dir + "/"
     ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                  directory=outputDirectory,
                                  config=config_ck)
@@ -98,12 +98,8 @@ if __name__ == "__main__":
     model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
     config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                  keep_checkpoint_max=cfg.keep_checkpoint_max)
-    # Note that this method saves the model file on each card. You need to specify the save path on each card.
-    # In this example, get_rank() is added to distinguish different paths.
-    if device_num == 1:
-        outputDirectory = train_dir
-    if device_num > 1:
-        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    # For data parallelism, the output model file only needs to be uploaded once
+    outputDirectory = train_dir
     ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                  directory=outputDirectory,
                                  config=config_ck)
@@ -112,10 +108,5 @@ if __name__ == "__main__":
     if (args.epoch_size):
         epoch_size = args.epoch_size
         print('epoch_size is: ', epoch_size)
     # set callback functions
     callback =[time_cb,LossMonitor()]
-    local_rank=int(os.getenv('RANK_ID'))
-    # for data parallel, only save checkpoint on rank 0
-    if local_rank==0 :
-        callback.append(ckpoint_cb)
     model.train(epoch_size,ds_train,callbacks=callback)