数据并行只上传一次训练结果

This commit is contained in:
liuzx 2023-05-26 10:40:20 +08:00
parent 1fa0edbf01
commit 09afaecfe8
4 changed files with 10 additions and 35 deletions

View File

@ -115,7 +115,7 @@ if __name__ == "__main__":
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
###假如选择了模型文件,使用pretrain_dir的方式,注意ckpt_url的方式依然保留,你依然可以使用ckpt_url的方式,但是这种方式将会逐渐废弃
###假如选择了模型文件,使用pretrain_url的方式,注意ckpt_url的方式依然保留,你依然可以使用ckpt_url的方式,但是这种方式将会逐渐废弃
load_param_into_net(network, load_checkpoint(os.path.join(pretrain_dir, "checkpoint_lenet-1_1875.ckpt")))
if args.device_target != "Ascend":
@ -133,12 +133,8 @@ if __name__ == "__main__":
config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir + "/"
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#若是数据并行,则只需要上传一次模型文件
outputDirectory = train_dir + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)

View File

@ -112,12 +112,9 @@ if __name__ == "__main__":
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
print("============== Starting Training ==============")
#若是数据并行,只需要上传一次输出的模型文件
outputDirectory = train_dir
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)
@ -126,11 +123,6 @@ if __name__ == "__main__":
if (args.epoch_size):
epoch_size = args.epoch_size
print('epoch_size is: ', epoch_size)
# set callback functions
callback =[time_cb,LossMonitor()]
local_rank=int(os.getenv('RANK_ID'))
# for data parallel, only save checkpoint on rank 0
if local_rank==0 :
callback.append(ckpoint_cb)
model.train(epoch_size,ds_train,callbacks=callback)
###智算不需要回传训练结果,会在任务结束后自动回传

View File

@ -115,12 +115,8 @@ if __name__ == "__main__":
config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir + "/"
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#若是数据并行,则只需要上传一次模型文件
outputDirectory = train_dir + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)

View File

@ -98,12 +98,8 @@ if __name__ == "__main__":
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#若是数据并行,只需要上传一次输出的模型文件
outputDirectory = train_dir
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)
@ -112,10 +108,5 @@ if __name__ == "__main__":
if (args.epoch_size):
epoch_size = args.epoch_size
print('epoch_size is: ', epoch_size)
# set callback functions
callback =[time_cb,LossMonitor()]
local_rank=int(os.getenv('RANK_ID'))
# for data parallel, only save checkpoint on rank 0
if local_rank==0 :
callback.append(ckpoint_cb)
model.train(epoch_size,ds_train,callbacks=callback)