Provide Bloom inference scripts and refine the Bloom README.

This commit is contained in:
ningbenzhe1 2023-10-17 19:06:12 +08:00
parent 2c117613ab
commit 010747ebb8
16 changed files with 562 additions and 30 deletions

View File

@ -84,7 +84,7 @@ The plan for more tasks, like RLHF and RM, is under way ...
<td> <a href="examples/baichuan/pretrain_baichuan_ptd_13B.sh">Train</a> </td>
</tr>
<tr>
<td rowspan="3"><a href="examples/bloom/README.md">Bloom</a></td>
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
<td>7B1</td>
<td> 1x8</td>
<td> FP16 </td>
@ -94,19 +94,12 @@ The plan for more tasks, like RLHF and RM, is under way ...
<td> <a href="examples/bloom/pretrain_bloom_7b1.sh">Train</a> </td>
</tr>
<tr>
<td rowspan="2">176B</td>
<td rowspan="2">12x8</td>
<td> FP16 </td>
<td> 106 tokens/p/s </td>
<td> 107 tokens/p/s </td>
<td> <a href="sources/images/bloom176B_fp16_loss.png">Loss</a> </td>
<td> -- </td>
</tr>
<tr>
<td >176B</td>
<td >12x8</td>
<td> BF16 </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> 100 tokens/p/s </td>
<td> 107 tokens/p/s </td>
<td> <a href="examples/bloom/images/bloom176b_lm_loss_compare.PNG">Loss</a> </td>
<td> <a href="examples/bloom/pretrain_bloom_176b.sh">Train</a> </td>
</tr>
<tr>
@ -334,14 +327,14 @@ For llama and other LLMs without bias in FFN, the linear transformation in FFN c
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
<td> 7B1 </td>
<td> -- </td>
<td> -- </td>
<td> <a href="https://gitee.com/ascend/AscendSpeed/tree/master/examples/bloom/generate_bloom_7B_tp8_pp1.sh">inference</a> </td>
<td> -- </td>
<td> -- </td>
</tr>
<tr>
<td> 176B </td>
<td> -- </td>
<td> -- </td>
<td> <a href="https://gitee.com/ascend/AscendSpeed/tree/master/examples/bloom/generate_176b_2nodes.sh">inference</a> </td>
<td> -- </td>
<td> -- </td>
</tr>

View File

@ -33,6 +33,7 @@ EOT
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
./tasks/inference/inference_alpaca.py \
--no-contiguous-buffers-in-local-ddp \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13824 \

View File

@ -35,6 +35,7 @@ EOT
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
./tasks/inference/inference_alpaca.py \
--no-contiguous-buffers-in-local-ddp \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13824 \

View File

@ -17,6 +17,7 @@ CHECKPOINT="your megatron checkpoint path"
VOCAB_FILE="your vocab path"
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/inference/inference_llama.py \
--no-contiguous-buffers-in-local-ddp \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 40 \

View File

@ -1,22 +1,207 @@
# Contents
- [Bloom-7B1](#contents)
- [Bloom-7B](#contents)
- [Training](#pre-training)
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Accuracy of the loss](#accuracy-of-the-loss)
- [Inference](#Inference)
- [Script](#script)
- [Bloom-176B](#contents)
- [Training](#pre-training)
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Accuracy of the loss](#accuracy-of-the-loss)
- [Inference](#inference)
- [Inference](#Inference)
- [Script](#script)
# Bloom-7B1
# Bloom-7B
## Training
Here's a hardware summary of pre-training Bloom-7B:
| **Hardware** | **Value** |
| ------------ | --------- |
| NPU | 1x8 Ascend 910 |
Here's a software summary of pre-training Bloom-7B:
| Software | Version | link |
| :-----------------------: |:--------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| Python | 3.8.18 | - |
| driver | 23.0.RC3.B070 | [link](https://support.huawei.com/enterprise/zh/ascend-computing/ascend-hdk-pid-252764743/software/261159048?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C252764743) |
| firmware | 6.4.0.3.220 | [link](https://support.huawei.com/enterprise/zh/ascend-computing/ascend-hdk-pid-252764743/software/261159048?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C252764743) |
| CANN | 7.0.RC1.B070 | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261305467?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
| binary arithmetic package | Ascend-cann-kernels-XXXX_7.0.RC1_linux | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261305467?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
| torch | 2.0.1 | - |
| torch_npu | 2.0.1 | [link](https://gitee.com/ascend/pytorch/releases/tag/v5.0.rc3-pytorch2.0.1) |
## Script
1. Clone the repository to your local server
```shell
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
```
2. Build environment
```shell
# python3.8
conda create -n bloom7b python=3.8
conda activate bloom7b
# install torch and torch_npu and apex
pip install torch-2.0.1-cp38-cp38-manylinux2014_aarch64.whl
pip install torch_npu-2.0.1rc1.postxxxxxxxx-cp38-cp38-linux_aarch64.whl
pip install apex-0.1_ascend_xxxxxxxx-cp38-cp38-linux_aarch64.whl
# install megatron-core
pip3 install -e git+https://github.com/NVIDIA/Megatron-LM.git@23.05#egg=megatron-core
# install deepspeed and deepspeed_npu
pip install deepspeed==0.9.2
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
cd deepspeed_npu
pip3 install -e ./
cd ..
# install other packages
pip install -r requirements.txt
```
3. Prepare pretrained weights
Download the Bloom-7B tokenizer from [here](https://huggingface.co/bigscience/bloom-7b1/tree/main).
```shell
mkdir tokenizer
cd tokenizer
wget https://huggingface.co/bigscience/bloom/resolve/main/special_tokens_map.json
wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json
wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer_config.json
cd ..
```
We provide a script that converts the pretrained Hugging Face weights into weights that AscendSpeed can load and use for training and inference.
```shell
#!/bin/bash
SCRIPT_PATH=./tasks/ckpt_convert/bloom/convert_weights_from_huggingface.py
python $SCRIPT_PATH \
--input-model-dir "your huggingface checkpoint path" \
--output-model-dir "your ascendspeed checkpoint path" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--type 7B \
--deepspeed
```
4. Prepare dataset
Download the Bloom-7B datasets from [here](https://huggingface.co/datasets/teven/enwiki_100k). The downloaded dataset is in parquet format by default.
You need to convert the dataset to the loose JSON format and then preprocess it; a minimal conversion sketch is shown after the commands below.
```shell
# download datasets
mkdir enwiki_100k_datasets
cd enwiki_100k_datasets
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00000-of-00006-67bcc7d401923db0.parquet
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00001-of-00006-6b8562cbb05789a4.parquet
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00002-of-00006-62d2b426a93b0912.parquet
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00003-of-00006-36c3d6da04c724b6.parquet
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00004-of-00006-48bdf99256dcfa5d.parquet
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00005-of-00006-bcb3b3af8d7a4140.parquet
cd ..
# preprocess datasets
python ./tools/preprocess_data.py \
--input ./enwiki_100k_datasets/ \
--tokenizer-name-or-path ./tokenizer \
--output-prefix ./enwiki_100k_datasets/enwiki-100k \
--worker 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
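The commands above only download the parquet files and run the tokenization step; the parquet-to-loose-JSON conversion itself is not shown. Below is a minimal sketch of one way to do it, assuming `pandas` with `pyarrow` is installed, each parquet file has a `text` column, and the output file name `enwiki-100k.json` is only an example; run it after the downloads and before `preprocess_data.py` if your copy of the preprocessing script does not read parquet directly.
```python
# hypothetical helper, not part of this repository
import glob
import json

import pandas as pd

output_path = "./enwiki_100k_datasets/enwiki-100k.json"
with open(output_path, "w", encoding="utf-8") as out:
    for parquet_file in sorted(glob.glob("./enwiki_100k_datasets/*.parquet")):
        df = pd.read_parquet(parquet_file)  # needs pyarrow or fastparquet
        for text in df["text"]:
            # loose json: one JSON object per line with a "text" field
            out.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
```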
5. Configure the Bloom-7B pre-training script: examples/bloom/pretrain_bloom_7b1.sh
```shell
# modify the datasets path and tokenizer path
TOKENIZER_NAME_OR_PATH=/home/bloom_data/vocab_file/
DATA_PATH=/home/bloom_data/enwiki_100k/enwiki-100k_text_document
```
6. Launch the Bloom-7B pre-training script: examples/bloom/pretrain_bloom_7b1.sh
Run examples/bloom/pretrain_bloom_7b1.sh on all nodes in the cluster.
```shell
bash examples/bloom/pretrain_bloom_7b1.sh
```
## Performance
### Machine Performance
The performance of Bloom-7B on **Ascend NPU** and the **Reference** platform:
| Device | Model | total Iterations | throughput rate (samples/s/p) | throughput rate (tokens/s/p) | single-step time (s/step) | floating point operation (TFLOPs/s) |
| ------ |----------|------------------|-------------------------------|------------------------------|---------------------------|-------------------------------------|
| NPUs | Bloom-7B | 1000 | 10.289 | 2603 | 18.67 | 115.55 |
| Reference | Bloom-7B | 1000 | 9.894 | 2525 | 19.40 | 111.19 |
### Accuracy of the loss
NPU vs GPU loss.
The NPU runs smoothly with stable resource usage, no errors are reported during training, the loss shows a decreasing trend, and the convergence speed is as expected.
![7b_lm_loss.png](images/7b_lm_loss.png)
NPU vs GPU loss relative error.
![relative_error.png](images/relative_error.png)
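For reference, the relative-error curve above can be reproduced from two per-step loss logs with a few lines of NumPy. This is only a sketch with assumed file names, not the script used to generate the plot:
```python
import numpy as np

# hypothetical per-step loss logs, one value per line
npu_loss = np.loadtxt("npu_lm_loss.txt")
gpu_loss = np.loadtxt("gpu_lm_loss.txt")

relative_error = np.abs(npu_loss - gpu_loss) / np.abs(gpu_loss)
print(f"mean relative error: {relative_error.mean():.4%}")
```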
## Inference
We support AscendSpeed Inference for text generation with BLOOM 7B.
Use [convert_weights_from_gptmodelpipe_to_gptmodel.sh](../../tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel.sh) to convert the DeepSpeed checkpoint to the Megatron format.
```bash
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
python $SCRIPT_PATH \
--input-model-dir ${INPUT_PATH} \
--output-model-dir ${OUTPUT_PATH} \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--type 7B
```
### Script
We generate text samples using the `generate_bloom` script. Inference differs from pre-training in that, for example, we need to load a pre-trained checkpoint and set the length of the output samples.
Configure the Bloom-7B inference script: examples/bloom/generate_bloom_7B_tp8_pp1.sh
```shell
# modify the model weight path and tokenizer path
CHECKPOINT=/home/bloom_data/enwiki_100k/enwiki-100k_text_document
VOCAB_FILE=/home/bloom_data/vocab_file/
```
```shell
bash ./examples/bloom/generate_bloom_7B_tp8_pp1.sh
```
# Bloom-176B
@ -30,7 +215,7 @@ Here's a hardware summary of pre-training Bloom-176B:
| **Hardware** | **Value** |
| ------------ | --------- |
| NPU | 12x8 Ascend 910B |
| NPU | 12x8 Ascend 910 |
Here's a software summary of pre-training Bloom-176B:
@ -42,7 +227,7 @@ Here's a software summary of pre-training Bloom-176B:
| CANN | Ascend-cann-toolkit_6.3.RC3.1_linux-aarch64.run | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261213460?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
| CANN-kernels | Ascend-cann-kernels-910b_6.3.RC3.1_linux.run | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261213460?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
| torch | 2.0.1 | <center>-</center> |
| torch_npu | 2.0.1 | [link](https://gitee.com/ascend/pytorch/releases/tag/v5.0.rc2-pytorch2.0.1) |
| torch_npu | 2.0.1 | [link](https://gitee.com/ascend/pytorch/releases/tag/v5.0.rc3-pytorch2.0.1) |
## Script
@ -91,7 +276,21 @@ wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json
wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer_config.json
cd ..
```
We provide a script that converts the pretrained Hugging Face weights into weights that AscendSpeed can load and use for training and inference. `--partition-layers` specifies the layer partitioning strategy under the pipeline parallel strategy; you can change it to a different strategy, but the sum of all elements of `--partition-layers` must equal 70 and the number of elements in `--partition-layers` must equal `--pipeline-model-parallel-size`. A quick sanity check is sketched after the command below.
```shell
#!/bin/bash
SCRIPT_PATH=./tasks/ckpt_convert/bloom/convert_weights_from_huggingface.py
python $SCRIPT_PATH \
--input-model-dir "your huggingface checkpoint path" \
--output-model-dir "your ascendspeed checkpoint path" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 12 \
--type 176B \
--deepspeed \
--partition-layers 6,6,6,6,6,6,6,6,6,6,6,4
```
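If you change `--partition-layers`, the constraints above can be checked in a couple of lines. This is a sketch only; the constants mirror the 176B command above:
```python
# Bloom-176B has 70 transformer layers split across 12 pipeline stages
partition_layers = [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4]
pipeline_model_parallel_size = 12
num_layers = 70

assert len(partition_layers) == pipeline_model_parallel_size, "one entry per pipeline stage"
assert sum(partition_layers) == num_layers, "entries must sum to the number of layers"
```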
4. Prepare dataset
Download the Bloom-176B datasets from [here](https://huggingface.co/datasets/teven/enwiki_100k). The downloaded dataset is in the parquet format by default.
@ -164,3 +363,61 @@ and GPU on a single-node system. The average relative error is 0.1%, less than 2
![bloom176b_1node_lm_loss_compare](./images/bloom176b_lm_loss_1node_compare.PNG)
## Inference
We support AscendSpeed Inference for text generation with BLOOM 176B.
Use [convert_weights_from_gptmodelpipe_to_gptmodel.sh](../../tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel.sh) to convert the DeepSpeed checkpoint to the Megatron format.
We use two-node inference. First, manually copy the pre-trained checkpoint to the two machines: node 0 needs layers 1-37 and node 1 needs layers 38-74. Then point the conversion script to the corresponding directories, set the related parameters, and run the conversion.
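As a rough guide to which checkpoint files belong on which node, the following hypothetical sketch enumerates them by name, assuming the usual `layer_XX-model_YY-model_states.pt` naming and 8-way tensor parallelism (adjust to your checkpoint layout):
```python
# hypothetical helper: list which pipeline-checkpoint files each node needs
tp_size = 8
node_layers = {"node0": range(1, 38), "node1": range(38, 75)}  # layers 1-37 and 38-74

for node, layers in node_layers.items():
    files = [f"layer_{l:02d}-model_{t:02d}-model_states.pt"
             for l in layers for t in range(tp_size)]
    print(node, f"{len(files)} files, e.g. {files[0]}")
```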
```bash
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
python $SCRIPT_PATH \
--input-model-dir ${INPUT_PATH} \
--output-model-dir ${OUTPUT_PATH} \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 2 \
--type 176B
```
### Script
We generate text samples using the `generate_bloom` script. Inference differs from pre-training in that, for example, we need to load a pre-trained checkpoint and set the length of the output samples.
Configure the Bloom-176B inference script: examples/bloom/generate_bloom_176b_2nodes.sh
```shell
# modify MASTER_ADDR to the IP address of the master node in the cluster.
# the master node is localhost, and the other nodes are the IP address of the master node, for example, 90.90.2.166
MASTER_ADDR=localhost
# modify the rank number of a node. The rank number of the master node is 0, and the rank number of other nodes increases in ascending order.
NODE_RANK=0
# modify the model weight path and tokenizer path
CHECKPOINT=/home/bloom_data/enwiki_100k/enwiki-100k_text_document
VOCAB_FILE=/home/bloom_data/vocab_file/
```
```shell
bash ./examples/bloom/generate_bloom_176b_2nodes.sh
```
## Example
1. Bloom-7B
![bloom_7b_generate.png](../../sources/images/bloom_7b_generate.png)
2. Bloom-176B
![bloom_176b_generate.png](../../sources/images/bloom_176b_generate.png)
All the provided scripts were tested on Ascend 910 64GB NPUs for BLOOM 7B and BLOOM 176B (fp16). These scripts might not work for other models or a different number of NPUs.
> Note: Sometimes NPU memory is not freed when an inference deployment crashes. You can free it by killing all remaining Python processes in the terminal (for example, `pkill -9 python`).
## Citation
You may also consider citing the original work in your references:
@article{scao2022bloom,
title={Bloom: A 176b-parameter open-access multilingual language model},
author={Scao, Teven Le and Fan, Angela and Akiki, Christopher and Pavlick, Ellie and Ili{\'c}, Suzana and Hesslow, Daniel and Castagn{\'e}, Roman and Luccioni, Alexandra Sasha and Yvon, Fran{\c{c}}ois and Gall{\'e}, Matthias and others},
journal={arXiv preprint arXiv:2211.05100},
year={2022}
}

View File

@ -0,0 +1,45 @@
#!/bin/bash
export TOKENIZERS_PARALLELISM=false
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
source /usr/local/Ascend/ascend-toolkit/set_env.sh
MASTER_ADDR=**.**.**.**
MASTER_PORT=12890
NNODES=2
NPUS_PER_NODE=8
NODE_RANK=1
VOCAB_FILE="your VOCAB FILE path"
CHECKPOINT="your checkpoint path"
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT"
# Real script
python -m torch.distributed.run $DISTRIBUTED_ARGS ./tasks/inference/inference_gpt.py \
--no-contiguous-buffers-in-local-ddp \
--load ${CHECKPOINT} \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${VOCAB_FILE} \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 2 \
--embed-layernorm \
--position-embedding-type alibi \
--num-layers 70 \
--hidden-size 14336 \
--num-attention-heads 112 \
--max-position-embeddings 2048 \
--seq-length 2048 \
--attention-dropout 0 \
--hidden-dropout 0 \
--micro-batch-size 1 \
--init-method-std 0.0048 \
--layernorm-epsilon 1e-6 \
--fp16 \
--no-load-optim \
--no-load-rng

View File

@ -0,0 +1,42 @@
#!/bin/bash
export TOKENIZERS_PARALLELISM=false
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
source /usr/local/Ascend/ascend-toolkit/set_env.sh
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT"
VOCAB_FILE="your VOCAB FILE path"
CHECKPOINT="your checkpoint path"
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/inference/inference_gpt.py \
--no-contiguous-buffers-in-local-ddp \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 30 \
--hidden-size 4096 \
--num-attention-heads 32 \
--max-position-embeddings 2048 \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path "$VOCAB_FILE" \
--tokenizer-not-use-fast \
--fp16 \
--micro-batch-size 1 \
--seq-length 2048 \
--max-new-tokens 64 \
--seed 42 \
--load "${CHECKPOINT}" \
--embed-layernorm \
--position-embedding-type alibi \

View File

@ -33,7 +33,8 @@ cat <<EOT > $config_json
EOT
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
./tasks/inference/inference_bloom_pipeline \
./tasks/inference/inference_bloom_pipeline.py \
--no-contiguous-buffers-in-local-ddp \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 30 \

View File

@ -34,6 +34,7 @@ EOT
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
./tasks/inference/inference_llama_pipeline.py \
--no-contiguous-buffers-in-local-ddp \
--tensor-model-parallel-size 8 \
--num-layers 30 \
--hidden-size 4096 \

View File

@ -17,6 +17,7 @@ CHECKPOINT="your megatron checkpoint path"
VOCAB_FILE="your vocab path"
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/inference/inference_llama.py \
--no-contiguous-buffers-in-local-ddp \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 32 \

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

View File

@ -0,0 +1,18 @@
#!/bin/bash
export TOKENIZERS_PARALLELISM=false
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
source /usr/local/Ascend/ascend-toolkit/set_env.sh
INPUT_PATH="your input checkpoint path"
OUTPUT_PATH="your output checkpoint path"
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
python $SCRIPT_PATH \
--input-model-dir ${INPUT_PATH} \
--output-model-dir ${OUTPUT_PATH} \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--type 7B

View File

@ -0,0 +1,18 @@
#!/bin/bash
export TOKENIZERS_PARALLELISM=false
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
source /usr/local/Ascend/ascend-toolkit/set_env.sh
INPUT_PATH="your input checkpoint path"
OUTPUT_PATH="your output checkpoint path"
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
python $SCRIPT_PATH \
--input-model-dir ${INPUT_PATH} \
--output-model-dir ${OUTPUT_PATH} \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 2 \
--type 176B

View File

@ -0,0 +1,153 @@
import argparse
import json
import math
import os
import sys

import torch

work_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))
print(f"work_path : {os.path.abspath(work_path)}")
sys.path.append(work_path)

from tools.ckpt_convert.llama.ckpt_utils import make_ascendspeed_model_dirs
from ascendspeed.error_utils import check_divisible_by_zero


def get_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input-model-dir", type=str, default="./input_model_dir", help="bloom huggingface model dir")
    parser.add_argument("--output-model-dir", type=str, default="./output_model_dir",
                        help="bloom ascendspeed model dir")
    parser.add_argument("--tensor-model-parallel-size", type=int, default=1, help="degree of tensor model parallel")
    parser.add_argument("--pipeline-model-parallel-size", type=int, default=1,
                        help="degree of pipeline model parallel")
    parser.add_argument("--type", type=str, choices=["7B", "176B"], default="7B")
    return parser.parse_args()


model_config = {
    "7B": [30, 4096, 32],  # num_layers, hidden_size, num_attention_heads
    "176B": [70, 14336, 112]
}


def extract_gptmodelpipe(input_model_dir):
    files = os.listdir(os.path.join(input_model_dir, f'global_step1000'))
    input_models = {}
    for f in files:
        ckpt_file_path = os.path.join(input_model_dir, f'global_step1000', f)
        input_models[f] = torch.load(ckpt_file_path, map_location="cpu")
    print(f"load gpt model pipe finish")
    return input_models


def generate_gptmodel_weights(input_dir, output_dir, tp_size, pp_size, model_type):
    layer_size = model_config.get(model_type)[0]
    gptmodelpipe = extract_gptmodelpipe(input_dir)
    # the set of parameter files actually present in the checkpoint
    param_file_set = [os.path.basename(file_path) for file_path in gptmodelpipe.keys()]
    param_file_set = sorted([file_name for file_name in param_file_set if file_name.startswith('layer_')])
    print(f"actual file set : {param_file_set}")
    release_model_dir = os.path.join(output_dir, "release")

    language_model = {}
    language_model['encoder'] = {}
    word_embeddings_for_head = {}
    for pp_rank in range(pp_size):
        layer_mean = math.ceil(check_divisible_by_zero(layer_size, pp_size))
        current_layer_pp_rank = list(range(pp_rank * layer_mean + 3, (pp_rank + 1) * layer_mean + 3))
        if pp_rank == 0:
            current_layer_pp_rank.append(1)
        if pp_rank == pp_size - 1:
            current_layer_pp_rank = list(range(pp_rank * layer_mean + 3, layer_size + 3))
            current_layer_pp_rank.append(1)
            current_layer_pp_rank.append(layer_size + 4)

        # the set of parameter files this pipeline rank should have in theory
        theo_file_set = []
        for layer_num in current_layer_pp_rank:
            for tp_rank in range(tp_size):
                theo_file_set.append(f"layer_{layer_num:02d}-model_{tp_rank:02d}-model_states.pt")
        print(f"theoretical file set : {theo_file_set}")
        if len(set(param_file_set) & set(theo_file_set)) == len(set(param_file_set)):
            print(f"current rank : {pp_rank}, contains layers {current_layer_pp_rank}")
        else:
            print(f"{current_layer_pp_rank} is not on rank: {pp_rank}")
            continue

        for tp_rank in range(tp_size):
            for layer_num in current_layer_pp_rank:
                layer_name = f"layer_{layer_num:02d}-model_{tp_rank:02d}-model_states.pt"
                if layer_num == 1:
                    if pp_rank == 0:
                        language_model['embedding'] = {}
                        language_model['embedding']['word_embeddings'] = {}
                        language_model['embedding']['word_embeddings']['weight'] = gptmodelpipe.get(layer_name).get(
                            'word_embeddings.weight')
                    if pp_rank == pp_size - 1:
                        word_embeddings_for_head['weight'] = gptmodelpipe.get(layer_name).get(
                            'word_embeddings.weight')
                elif layer_num == 2:
                    continue
                elif layer_num == layer_size + 3:
                    continue
                elif layer_num == layer_size + 4:
                    language_model['encoder']["final_layernorm.weight"] = gptmodelpipe.get(layer_name).get(
                        "final_layernorm.weight")
                    language_model['encoder']["final_layernorm.bias"] = gptmodelpipe.get(layer_name).get(
                        "final_layernorm.bias")
                else:
                    encoder_layer_name = f"layers.{layer_num - 3 - pp_rank * layer_mean}."
                    language_model['encoder'][f"{encoder_layer_name}input_layernorm.weight"] = \
                        gptmodelpipe.get(layer_name).get('input_layernorm.weight')
                    language_model['encoder'][f"{encoder_layer_name}input_layernorm.bias"] = \
                        gptmodelpipe.get(layer_name).get('input_layernorm.bias')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.query_key_value.weight"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.query_key_value.weight')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.query_key_value.bias"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.query_key_value.bias')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.dense.weight"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.dense.weight')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.dense.bias"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.dense.bias')
                    language_model['encoder'][f"{encoder_layer_name}post_attention_layernorm.weight"] = \
                        gptmodelpipe.get(layer_name).get('post_attention_layernorm.weight')
                    language_model['encoder'][f"{encoder_layer_name}post_attention_layernorm.bias"] = \
                        gptmodelpipe.get(layer_name).get('post_attention_layernorm.bias')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_h_to_4h.weight"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_h_to_4h.weight')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_h_to_4h.bias"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_h_to_4h.bias')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_4h_to_h.weight"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_4h_to_h.weight')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_4h_to_h.bias"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_4h_to_h.bias')

            model_dic = {'checkpoint_version': 3.0, 'model': {}}
            model_dic['model']['language_model'] = language_model
            if pp_rank == pp_size - 1:
                model_dic['model']['word_embeddings_for_head'] = word_embeddings_for_head
            print(model_dic['model'].keys())
            model_dir = os.path.join(release_model_dir, f"{'mp_rank_{:02d}_{:03d}'.format(tp_rank, pp_rank)}")
            os.makedirs(model_dir, exist_ok=True)
            torch.save(model_dic, os.path.join(model_dir, "model_optim_rng.pt"))
            print(f'{os.path.join(model_dir, "model_optim_rng.pt")} saved!')

        # the matching pipeline rank has been saved; nothing left to do on this node
        break
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>change over>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")


if __name__ == '__main__':
    args = get_args()
    make_ascendspeed_model_dirs(args.output_model_dir)
    generate_gptmodel_weights(args.input_model_dir, args.output_model_dir, args.tensor_model_parallel_size,
                              args.pipeline_model_parallel_size, args.type)