#!/bin/bash
#
# Launcher for a 2-device data-parallel training run: sets up the shared
# environment, then starts one train_dataparallel_debug_env.py process per
# device, each inside its own deviceN working directory.

# Abort on the first failing command so a broken setup step is not ignored.
set -e

# Directory the script was started from; used to locate the rank table file.
EXEC_PATH="$(pwd)"

# Number of devices; also selects which test_dist_<N>pcs setup function runs.
export RANK_SIZE=2

#######################################
# Export the environment variables for a 2-device run.
# Globals:
#   EXEC_PATH        (read)     directory containing rank_table_2pcs.json
#   RANK_TABLE_FILE  (exported) set to ${EXEC_PATH}/rank_table_2pcs.json
#   RANK_SIZE        (exported) set to 2
# Arguments:
#   None
#######################################
test_dist_2pcs() {
  # Quoted so an EXEC_PATH containing spaces stays a single value.
  export RANK_TABLE_FILE="${EXEC_PATH}/rank_table_2pcs.json"
  export RANK_SIZE=2
}

# Run the setup function that matches the configured device count
# (test_dist_<RANK_SIZE>pcs). Fail with a clear message when no such function
# exists instead of a bare "command not found".
setup_fn="test_dist_${RANK_SIZE}pcs"
if ! declare -F "${setup_fn}" > /dev/null; then
  printf 'Unsupported RANK_SIZE=%s: function %s is not defined.\n' \
    "${RANK_SIZE}" "${setup_fn}" >&2
  exit 1
fi
"${setup_fn}"

# Start one background training process per device. The device count follows
# RANK_SIZE and falls back to 2 when it is unset or empty.
device_count="${RANK_SIZE:-2}"
for ((i = 0; i < device_count; i++)); do
  workdir="device${i}"

  # Recreate the per-device directory from scratch and copy the sources in.
  rm -rf "./${workdir}"
  mkdir "./${workdir}"
  cp ./train_dataparallel_debug_env.py ./config.py ./lenet.py ./dataset_distributed.py "./${workdir}"

  cd "./${workdir}" || exit 1
  export DEVICE_ID="${i}"
  export RANK_ID="${i}"
  echo "start training for device $i"

  # Snapshot the environment the training process is launched with.
  env > "env${i}.log"

  # Run in the background with stdout and stderr captured in a per-device
  # log; this loop does not wait for the process to finish.
  python ./train_dataparallel_debug_env.py > "train_dataparallel_debug_env.log${i}" 2>&1 &

  cd ../ || exit 1
done

# Point at the log file that is actually written for device 0.
echo "The program launch succeed, the log is under device0/train_dataparallel_debug_env.log0."