数据并行只上传一次训练结果
This commit is contained in:
parent
1fa0edbf01
commit
09afaecfe8
10
pretrain.py
10
pretrain.py
|
@@ -115,7 +115,7 @@ if __name__ == "__main__":
|
||||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
|
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
|
||||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
|
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
|
||||||
|
|
||||||
###假如选择了模型文件,使用pretrain_dir,注意ckpt_url的方式依然保留,你依然可以使用ckpt_url的方式,但是这种方式将会逐渐废弃
|
###假如选择了模型文件,使用pretrain_url的方式,注意ckpt_url的方式依然保留,你依然可以使用ckpt_url的方式,但是这种方式将会逐渐废弃
|
||||||
load_param_into_net(network, load_checkpoint(os.path.join(pretrain_dir, "checkpoint_lenet-1_1875.ckpt")))
|
load_param_into_net(network, load_checkpoint(os.path.join(pretrain_dir, "checkpoint_lenet-1_1875.ckpt")))
|
||||||
|
|
||||||
if args.device_target != "Ascend":
|
if args.device_target != "Ascend":
|
||||||
|
@@ -133,12 +133,8 @@ if __name__ == "__main__":
|
||||||
config_ck = CheckpointConfig(
|
config_ck = CheckpointConfig(
|
||||||
save_checkpoint_steps=cfg.save_checkpoint_steps,
|
save_checkpoint_steps=cfg.save_checkpoint_steps,
|
||||||
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
||||||
#Note that this method saves the model file on each card. You need to specify the save path on each card.
|
#若是数据并行,则只需要上传一次模型文件
|
||||||
# In this example, get_rank() is added to distinguish different paths.
|
outputDirectory = train_dir + "/"
|
||||||
if device_num == 1:
|
|
||||||
outputDirectory = train_dir + "/"
|
|
||||||
if device_num > 1:
|
|
||||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
|
|
||||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
||||||
directory=outputDirectory,
|
directory=outputDirectory,
|
||||||
config=config_ck)
|
config=config_ck)
|
||||||
|
|
|
@@ -112,12 +112,9 @@ if __name__ == "__main__":
|
||||||
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
|
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
|
||||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
|
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
|
||||||
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
||||||
#Note that this method saves the model file on each card. You need to specify the save path on each card.
|
print("============== Starting Training ==============")
|
||||||
# In this example, get_rank() is added to distinguish different paths.
|
#若是数据并行,只需要上传一次输出的模型文件
|
||||||
if device_num == 1:
|
outputDirectory = train_dir
|
||||||
outputDirectory = train_dir
|
|
||||||
if device_num > 1:
|
|
||||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
|
|
||||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
||||||
directory=outputDirectory,
|
directory=outputDirectory,
|
||||||
config=config_ck)
|
config=config_ck)
|
||||||
|
@@ -126,11 +123,6 @@ if __name__ == "__main__":
|
||||||
if (args.epoch_size):
|
if (args.epoch_size):
|
||||||
epoch_size = args.epoch_size
|
epoch_size = args.epoch_size
|
||||||
print('epoch_size is: ', epoch_size)
|
print('epoch_size is: ', epoch_size)
|
||||||
# set callback functions
|
|
||||||
callback =[time_cb,LossMonitor()]
|
callback =[time_cb,LossMonitor()]
|
||||||
local_rank=int(os.getenv('RANK_ID'))
|
|
||||||
# for data parallel, only save checkpoint on rank 0
|
|
||||||
if local_rank==0 :
|
|
||||||
callback.append(ckpoint_cb)
|
|
||||||
model.train(epoch_size,ds_train,callbacks=callback)
|
model.train(epoch_size,ds_train,callbacks=callback)
|
||||||
###智算不需要回传训练结果,会在任务结束后自动回传
|
###智算不需要回传训练结果,会在任务结束后自动回传
|
8
train.py
8
train.py
|
@@ -115,12 +115,8 @@ if __name__ == "__main__":
|
||||||
config_ck = CheckpointConfig(
|
config_ck = CheckpointConfig(
|
||||||
save_checkpoint_steps=cfg.save_checkpoint_steps,
|
save_checkpoint_steps=cfg.save_checkpoint_steps,
|
||||||
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
||||||
#Note that this method saves the model file on each card. You need to specify the save path on each card.
|
#若是数据并行,则只需要上传一次模型文件
|
||||||
# In this example, get_rank() is added to distinguish different paths.
|
outputDirectory = train_dir + "/"
|
||||||
if device_num == 1:
|
|
||||||
outputDirectory = train_dir + "/"
|
|
||||||
if device_num > 1:
|
|
||||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
|
|
||||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
||||||
directory=outputDirectory,
|
directory=outputDirectory,
|
||||||
config=config_ck)
|
config=config_ck)
|
||||||
|
|
|
@@ -98,12 +98,8 @@ if __name__ == "__main__":
|
||||||
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
|
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
|
||||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
|
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
|
||||||
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
keep_checkpoint_max=cfg.keep_checkpoint_max)
|
||||||
#Note that this method saves the model file on each card. You need to specify the save path on each card.
|
#若是数据并行,只需要上传一次输出的模型文件
|
||||||
# In this example, get_rank() is added to distinguish different paths.
|
outputDirectory = train_dir
|
||||||
if device_num == 1:
|
|
||||||
outputDirectory = train_dir
|
|
||||||
if device_num > 1:
|
|
||||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
|
|
||||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
|
||||||
directory=outputDirectory,
|
directory=outputDirectory,
|
||||||
config=config_ck)
|
config=config_ck)
|
||||||
|
@@ -112,10 +108,5 @@ if __name__ == "__main__":
|
||||||
if (args.epoch_size):
|
if (args.epoch_size):
|
||||||
epoch_size = args.epoch_size
|
epoch_size = args.epoch_size
|
||||||
print('epoch_size is: ', epoch_size)
|
print('epoch_size is: ', epoch_size)
|
||||||
# set callback functions
|
|
||||||
callback =[time_cb,LossMonitor()]
|
callback =[time_cb,LossMonitor()]
|
||||||
local_rank=int(os.getenv('RANK_ID'))
|
|
||||||
# for data parallel, only save checkpoint on rank 0
|
|
||||||
if local_rank==0 :
|
|
||||||
callback.append(ckpoint_cb)
|
|
||||||
model.train(epoch_size,ds_train,callbacks=callback)
|
model.train(epoch_size,ds_train,callbacks=callback)
|
Loading…
Reference in New Issue