数据并行只上传一次训练结果

This commit is contained in:
liuzx 2023-05-26 10:40:20 +08:00
parent 1fa0edbf01
commit 09afaecfe8
4 changed files with 10 additions and 35 deletions

View File

@ -115,7 +115,7 @@ if __name__ == "__main__":
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
###假如选择了模型文件,使用pretrain_dir的方式,注意ckpt_url的方式依然保留,你依然可以使用ckpt_url的方式,但是这种方式将会逐渐废弃
###假如选择了模型文件,使用pretrain_url的方式,注意ckpt_url的方式依然保留,你依然可以使用ckpt_url的方式,但是这种方式将会逐渐废弃
load_param_into_net(network, load_checkpoint(os.path.join(pretrain_dir, "checkpoint_lenet-1_1875.ckpt")))
if args.device_target != "Ascend":
@ -133,12 +133,8 @@ if __name__ == "__main__":
config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir + "/"
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#若是数据并行,则只需要上传一次模型文件
outputDirectory = train_dir + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)

View File

@ -112,12 +112,9 @@ if __name__ == "__main__":
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
print("============== Starting Training ==============")
#若是数据并行,只需要上传一次输出的模型文件
outputDirectory = train_dir
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)
@ -126,11 +123,6 @@ if __name__ == "__main__":
if (args.epoch_size):
epoch_size = args.epoch_size
print('epoch_size is: ', epoch_size)
# set callback functions
callback =[time_cb,LossMonitor()]
local_rank=int(os.getenv('RANK_ID'))
# for data parallel, only save checkpoint on rank 0
if local_rank==0 :
callback.append(ckpoint_cb)
model.train(epoch_size,ds_train,callbacks=callback)
###智算不需要回传训练结果,会在任务结束后自动回传

View File

@ -115,12 +115,8 @@ if __name__ == "__main__":
config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir + "/"
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#若是数据并行,则只需要上传一次模型文件
outputDirectory = train_dir + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)

View File

@ -98,12 +98,8 @@ if __name__ == "__main__":
model = Model(network, net_loss,net_opt,metrics={"accuracy"},amp_level="O2")
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#若是数据并行,只需要上传一次输出的模型文件
outputDirectory = train_dir
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)
@ -112,10 +108,5 @@ if __name__ == "__main__":
if (args.epoch_size):
epoch_size = args.epoch_size
print('epoch_size is: ', epoch_size)
# set callback functions
callback =[time_cb,LossMonitor()]
local_rank=int(os.getenv('RANK_ID'))
# for data parallel, only save checkpoint on rank 0
if local_rank==0 :
callback.append(ckpoint_cb)
model.train(epoch_size,ds_train,callbacks=callback)