Provide Bloom inference scripts and refine the Bloom README.
parent 2c117613ab
commit 010747ebb8
README.md: 23 changes
|
@ -84,7 +84,7 @@ The plan for more tasks, like RLHF and RM, is under way ...
|
|||
<td> <a href="examples/baichuan/pretrain_baichuan_ptd_13B.sh">Train</a> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="3"><a href="examples/bloom/README.md">Bloom</a></td>
|
||||
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
|
||||
<td>7B1</td>
|
||||
<td> 1x8</td>
|
||||
<td> FP16 </td>
|
||||
|
@ -94,19 +94,12 @@ The plan for more tasks, like RLHF and RM, is under way ...
|
|||
<td> <a href="examples/bloom/pretrain_bloom_7b1.sh">Train</a> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td rowspan="2">176B</td>
|
||||
<td rowspan="2">12x8</td>
|
||||
<td> FP16 </td>
|
||||
<td> 106 tokens/p/s </td>
|
||||
<td> 107 tokens/p/s </td>
|
||||
<td> <a href="sources/images/bloom176B_fp16_loss.png">Loss</a> </td>
|
||||
<td> -- </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td >176B</td>
|
||||
<td >12x8</td>
|
||||
<td> BF16 </td>
|
||||
<td> -- </td>
|
||||
<td> -- </td>
|
||||
<td> -- </td>
|
||||
<td> 100 tokens/p/s </td>
|
||||
<td> 107 tokens/p/s </td>
|
||||
<td> <a href="examples/bloom/images/bloom176b_lm_loss_compare.PNG">Loss</a> </td>
|
||||
<td> <a href="examples/bloom/pretrain_bloom_176b.sh">Train</a> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
@ -334,14 +327,14 @@ For llama and other LLMs without bias in FFN, the linear transformation in FFN c
|
|||
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
|
||||
<td> 7B1 </td>
|
||||
<td> -- </td>
|
||||
<td> -- </td>
|
||||
<td> <a href="https://gitee.com/ascend/AscendSpeed/tree/master/examples/bloom/generate_bloom_7B_tp8_pp1.sh">inference</a> </td>
|
||||
<td> -- </td>
|
||||
<td> -- </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td> 176B </td>
|
||||
<td> -- </td>
|
||||
<td> -- </td>
|
||||
<td> <a href="https://gitee.com/ascend/AscendSpeed/tree/master/examples/bloom/generate_176b_2nodes.sh">inference</a> </td>
|
||||
<td> -- </td>
|
||||
<td> -- </td>
|
||||
</tr>
|
||||
|
|
|
@ -33,6 +33,7 @@ EOT
|
|||
|
||||
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
|
||||
./tasks/inference/inference_alpaca.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--num-layers 40 \
|
||||
--hidden-size 5120 \
|
||||
--ffn-hidden-size 13824 \
|
|
@ -35,6 +35,7 @@ EOT
|
|||
|
||||
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
|
||||
./tasks/inference/inference_alpaca.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--num-layers 40 \
|
||||
--hidden-size 5120 \
|
||||
--ffn-hidden-size 13824 \
|
|
@ -17,6 +17,7 @@ CHECKPOINT="your megatron checkpoint path"
|
|||
VOCAB_FILE="your vocab path"
|
||||
|
||||
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/inference/inference_llama.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 1 \
|
||||
--num-layers 40 \
|
||||
|
|
|
@ -1,22 +1,207 @@
|
|||
|
||||
# Contents
|
||||
|
||||
- [Bloom-7B1](#contents)
|
||||
- [Bloom-7B](#contents)
|
||||
- [Training](#pre-training)
|
||||
- [Script](#script)
|
||||
- [Performance](#performance)
|
||||
- [Machine performance](#machine-performance)
|
||||
- [Accuracy of the loss](#accuracy-of-the-loss)
|
||||
|
||||
- [Script](#script)
|
||||
- [Performance](#performance)
|
||||
- [Machine performance](#machine-performance)
|
||||
- [Accuracy of the loss](#accuracy-of-the-loss)
|
||||
- [Inference](#Inference)
|
||||
- [Script](#script)
|
||||
- [Bloom-176B](#contents)
|
||||
- [Training](#pre-training)
|
||||
- [Script](#script)
|
||||
- [Performance](#performance)
|
||||
- [Machine performance](#machine-performance)
|
||||
- [Accuracy of the loss](#accuracy-of-the-loss)
|
||||
- [Inference](#inference)
|
||||
- [Script](#script)
|
||||
- [Performance](#performance)
|
||||
- [Machine performance](#machine-performance)
|
||||
- [Accuracy of the loss](#accuracy-of-the-loss)
|
||||
- [Inference](#Inference)
|
||||
- [Script](#script)
|
||||
|
||||
# Bloom-7B1
|
||||
# Bloom-7B
|
||||
|
||||
## Training
|
||||
|
||||
|
||||
Here's a hardware summary of pre-training Bloom-7B:
|
||||
|
||||
| **Hardware** | **Value** |
|
||||
| ------------ | --------- |
|
||||
| NPU | 1x8 Ascend 910 |
|
||||
|
||||
Here's a software summary of pre-training Bloom-7B:
|
||||
|
||||
| Software | Version | link |
|
||||
| :-----------------------: |:--------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| Python | 3.8.18 | - |
|
||||
| driver | 23.0.RC3.B070 | [link](https://support.huawei.com/enterprise/zh/ascend-computing/ascend-hdk-pid-252764743/software/261159048?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C252764743) |
|
||||
| firmware | 6.4.0.3.220 | [link](https://support.huawei.com/enterprise/zh/ascend-computing/ascend-hdk-pid-252764743/software/261159048?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C252764743) |
|
||||
| CANN | 7.0.RC1.B070 | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261305467?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
|
||||
| binary arithmetic package | Ascend-cann-kernels-XXXX_7.0.RC1_linux | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261305467?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
|
||||
| torch | 2.0.1 | - |
|
||||
| torch_npu | 2.0.1 | [link](https://gitee.com/ascend/pytorch/releases/tag/v5.0.rc3-pytorch2.0.1) |
|
||||
|
||||
## Script
|
||||
|
||||
1. Clone the repository to your local server
|
||||
|
||||
```shell
|
||||
git clone https://gitee.com/ascend/AscendSpeed.git
|
||||
cd AscendSpeed
|
||||
```
|
||||
|
||||
2. Build environment
|
||||
|
||||
```shell
|
||||
# python3.8
|
||||
conda create -n bloom7b python=3.8
|
||||
conda activate bloom7b
|
||||
|
||||
# install torch and torch_npu and apex
|
||||
pip install torch-2.0.1-cp38-cp38-manylinux2014_aarch64.whl
|
||||
pip install torch_npu-2.0.1rc1.postxxxxxxxx-cp38-cp38-linux_aarch64.whl
|
||||
pip install apex-0.1_ascend_xxxxxxxx-cp38-cp38-linux_aarch64.whl
|
||||
|
||||
# install megatron-core
|
||||
pip3 install -e git+https://github.com/NVIDIA/Megatron-LM.git@23.05#egg=megatron-core
|
||||
|
||||
# install deepspeed and deepspeed_npu
|
||||
pip install deepspeed==0.9.2
|
||||
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
|
||||
cd deepspeed_npu
|
||||
pip3 install -e ./
|
||||
cd ..
|
||||
|
||||
# install other packages
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Prepare pretrained weights
|
||||
|
||||
Download the Bloom-7B tokenizer from [here](https://huggingface.co/bigscience/bloom-7b1/tree/main).
|
||||
|
||||
```shell
|
||||
mkdir tokenizer
|
||||
cd tokenizer
|
||||
wget https://huggingface.co/bigscience/bloom/resolve/main/special_tokens_map.json
|
||||
wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json
|
||||
wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer_config.json
|
||||
cd ..
|
||||
```
|
||||
|
||||
We provide scripts to convert the pretrained weights into a format that AscendSpeed can load for training and inference.
|
||||
|
||||
```shell
|
||||
#!/bin/bash
|
||||
|
||||
SCRIPT_PATH=./tasks/ckpt_convert/bloom/convert_weights_from_huggingface.py
|
||||
python $SCRIPT_PATH \
|
||||
--input-model-dir "your huggingface checkpoint path" \
|
||||
--output-model-dir "your ascendspeed checkpoint path" \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 1 \
|
||||
--type 7B \
|
||||
--deepspeed
|
||||
```
|
||||
|
||||
4. Prepare dataset
|
||||
|
||||
Download the Bloom-7B dataset from [here](https://huggingface.co/datasets/teven/enwiki_100k). The downloaded dataset is in parquet format by default.
You need to convert the dataset to the loose JSON format (see the sketch below) and then preprocess it.
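A minimal conversion sketch is shown below (illustrative only, not part of the repository; it assumes `pandas` with `pyarrow` is installed and that the parquet shards store the articles in a `text` column):

```python
# hypothetical helper: convert the downloaded parquet shards to loose JSON (one record per line)
import glob
import json

import pandas as pd  # requires pandas and pyarrow

for shard in sorted(glob.glob("./enwiki_100k_datasets/*.parquet")):
    df = pd.read_parquet(shard)
    out_path = shard.replace(".parquet", ".json")
    with open(out_path, "w", encoding="utf-8") as f:
        for text in df["text"]:  # the "text" column name is an assumption
            f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
    print(f"wrote {out_path}")
```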
|
||||
|
||||
```shell
|
||||
# download datasets
|
||||
mkdir enwiki_100k_datasets
|
||||
cd enwiki_100k_datasets
|
||||
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00000-of-00006-67bcc7d401923db0.parquet
|
||||
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00001-of-00006-6b8562cbb05789a4.parquet
|
||||
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00002-of-00006-62d2b426a93b0912.parquet
|
||||
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00003-of-00006-36c3d6da04c724b6.parquet
|
||||
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00004-of-00006-48bdf99256dcfa5d.parquet
|
||||
wget https://huggingface.co/datasets/teven/enwiki_100k/resolve/main/data/train-00005-of-00006-bcb3b3af8d7a4140.parquet
|
||||
cd ..
|
||||
|
||||
# preprocess datasets
|
||||
python ./tools/preprocess_data.py \
|
||||
--input ./enwiki_100k_datasets/ \
|
||||
--tokenizer-name-or-path ./tokenizer \
|
||||
--output-prefix ./enwiki_100k_datasets/enwiki-100k \
|
||||
--worker 4 \
|
||||
--log-interval 1000 \
|
||||
--tokenizer-type PretrainedFromHF
|
||||
```
|
||||
|
||||
5. Configure the Bloom-7B pre-training script: examples/bloom/pretrain_bloom_7b1.sh
|
||||
|
||||
```shell
|
||||
# modify the datasets path and tokenizer path
|
||||
TOKENIZER_NAME_OR_PATH=/home/bloom_data/vocab_file/
|
||||
DATA_PATH=/home/bloom_data/enwiki_100k/enwiki-100k_text_document
|
||||
```
|
||||
|
||||
6. Launch the Bloom-7B pre-training script: examples/bloom/pretrain_bloom_7b1.sh

Run examples/bloom/pretrain_bloom_7b1.sh on all nodes in the cluster.
|
||||
|
||||
```shell
|
||||
bash examples/bloom/pretrain_bloom_7b1.sh
|
||||
```
|
||||
|
||||
## Performance
|
||||
|
||||
### Machine Performance
|
||||
|
||||
The performance of Bloom-7B on **Ascend NPU** and **Reference**:
|
||||
|
||||
| Device | Model | total Iterations | throughput rate (samples/s/p) | throughput rate (tokens/s/p) | single-step time (s/step) | floating point operation (TFLOPs/s) |
|
||||
| ------ |----------|------------------|-------------------------------|------------------------------|---------------------------|-------------------------------------|
|
||||
| NPUs | Bloom-7B | 1000 | 10.289 | 2603 | 18.67 | 115.55 |
|
||||
| Reference | Bloom-7B | 1000 | 9.894 | 2525 | 19.40 | 111.19 |
|
||||
|
||||
|
||||
|
||||
#### Accuracy of the loss
|
||||
|
||||
NPU vs GPU loss.
|
||||
|
||||
The NPU training runs smoothly: resource usage is stable, no errors are reported during the run, the loss shows a decreasing trend, and the convergence speed is as expected.
|
||||
|
||||

|
||||
|
||||
NPU vs GPU loss relative error.
|
||||
|
||||

|
||||
|
||||
## Inference
|
||||
|
||||
We support AscendSpeed Inference for text generation with BLOOM 7B.
|
||||
|
||||
Use [convert_weights_from_gptmodelpipe_to_gptmodel.sh](../../tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel.sh) to convert the DeepSpeed checkpoint to the Megatron format.
|
||||
|
||||
```bash
|
||||
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
|
||||
python $SCRIPT_PATH \
|
||||
--input-model-dir ${INPUT_PATH} \
|
||||
--output-model-dir ${OUTPUT_PATH} \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 1 \
|
||||
--type 7B
|
||||
```
|
||||
### Script
|
||||
|
||||
We generate text samples using the `generate_bloom` script. Inference differs from pre-training in several ways: for example, we need to load the pre-trained checkpoint and set the length of the output samples.
|
||||
|
||||
Configure the Bloom-7B inference script: examples/bloom/generate_bloom_7B_tp8_pp1.sh
|
||||
|
||||
```shell
|
||||
# modify the model weight path and tokenizer path
|
||||
CHECKPOINT=/home/bloom_data/enwiki_100k/enwiki-100k_text_document
|
||||
VOCAB_FILE=/home/bloom_data/vocab_file/
|
||||
```
|
||||
|
||||
```shell
|
||||
bash ./examples/bloom/generate_bloom_7B_tp8_pp1.sh
|
||||
```
|
||||
|
||||
# Bloom-176B
|
||||
|
||||
|
@ -30,7 +215,7 @@ Here's a hardware summary of pre-training Bloom-176B:
|
|||
|
||||
| **Hardware** | **Value** |
|
||||
| ------------ | --------- |
|
||||
| NPU | 12x8 Ascend 910B |
|
||||
| NPU | 12x8 Ascend 910 |
|
||||
|
||||
Here's a software summary of pre-training Bloom-176B:
|
||||
|
||||
|
@ -42,7 +227,7 @@ Here's a software summary of pre-training Bloom-176B:
|
|||
| CANN | Ascend-cann-toolkit_6.3.RC3.1_linux-aarch64.run | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261213460?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
|
||||
| CANN-kernels | Ascend-cann-kernels-910b_6.3.RC3.1_linux.run | [link](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/261213460?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251168373) |
|
||||
| torch | 2.0.1 | <center>-</center> |
|
||||
| torch_npu | 2.0.1 | [link](https://gitee.com/ascend/pytorch/releases/tag/v5.0.rc2-pytorch2.0.1) |
|
||||
| torch_npu | 2.0.1 | [link](https://gitee.com/ascend/pytorch/releases/tag/v5.0.rc3-pytorch2.0.1) |
|
||||
|
||||
## Script
|
||||
|
||||
|
@ -91,7 +276,21 @@ wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json
|
|||
wget https://huggingface.co/bigscience/bloom/resolve/main/tokenizer_config.json
|
||||
cd ..
|
||||
```
|
||||
We provide scripts to convert the pretrained weights into a format that AscendSpeed can load for training and inference. `--partition-layers` specifies the layer partitioning strategy under pipeline parallelism. You can change it to a different strategy, but the sum of all elements of `--partition-layers` must equal 70, and the number of elements must equal `--pipeline-model-parallel-size` (a quick sanity check is sketched after the block below).
|
||||
|
||||
```shell
|
||||
#!/bin/bash
|
||||
|
||||
SCRIPT_PATH=./tasks/ckpt_convert/bloom/convert_weights_from_huggingface.py
|
||||
python $SCRIPT_PATH \
|
||||
--input-model-dir "your huggingface checkpoint path" \
|
||||
--output-model-dir "your ascendspeed checkpoint path" \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 12 \
|
||||
--type 176B \
|
||||
--deepspeed \
|
||||
--partition-layers 6,6,6,6,6,6,6,6,6,6,6,4
|
||||
```
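As a quick sanity check of the partition (an illustrative sketch; the values mirror the example above), you can confirm that it covers all 70 layers and has one entry per pipeline stage:

```python
# hypothetical sanity check for the --partition-layers value
partition_layers = [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4]
pipeline_model_parallel_size = 12
num_layers = 70  # transformer layers in Bloom-176B

assert sum(partition_layers) == num_layers, f"partition sums to {sum(partition_layers)}, expected {num_layers}"
assert len(partition_layers) == pipeline_model_parallel_size, "one entry is required per pipeline stage"
print("partition-layers configuration is consistent")
```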
|
||||
4. Prepare dataset
|
||||
|
||||
Download the Bloom-176B datasets from [here](https://huggingface.co/datasets/teven/enwiki_100k). The downloaded dataset is in the parquet format by default.
|
||||
|
@ -164,3 +363,61 @@ and GPU on a single-node system. The average relative error is 0.1%, less than 2
|
|||
|
||||

|
||||
|
||||
## Inference
|
||||
|
||||
We support AscendSpeed Inference for text generation with BLOOM 176B.
|
||||
|
||||
Use [convert_weights_from_gptmodelpipe_to_gptmodel.sh](../../tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel.sh) to convert the DeepSpeed checkpoint to the Megatron format.
|
||||
|
||||
We use two-node inference. First, manually distribute the pre-trained checkpoint across the two machines: node 0 needs layers 1-37 and node 1 needs layers 38-74. Then configure the conversion script's directories and related parameters on each node and run the conversion (a hedged file-selection sketch is shown below).
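Below is a hedged sketch of how the per-layer checkpoint files could be selected on one node before conversion (the `layer_XX-model_YY-model_states.pt` naming follows the conversion script in this repository; the directory paths are illustrative):

```python
# hypothetical helper: stage only the DeepSpeed layer files that this node needs
import glob
import os
import re
import shutil

SRC = "/home/bloom_data/ckpt/global_step1000"  # illustrative source directory
DST = "/home/bloom_data/ckpt_node0"            # illustrative staging directory for this node
NODE_LAYERS = range(1, 38)                     # node 0: layers 1-37; use range(38, 75) on node 1

os.makedirs(DST, exist_ok=True)
for path in glob.glob(os.path.join(SRC, "layer_*-model_*-model_states.pt")):
    match = re.match(r"layer_(\d+)-", os.path.basename(path))
    if match and int(match.group(1)) in NODE_LAYERS:
        shutil.copy(path, DST)  # copy (or scp) only the layers this node needs
```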
|
||||
```bash
|
||||
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
|
||||
python $SCRIPT_PATH \
|
||||
--input-model-dir ${INPUT_PATH} \
|
||||
--output-model-dir ${OUTPUT_PATH} \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 2 \
|
||||
--type 176B
|
||||
```
|
||||
### Script
|
||||
We generate text samples using the `generate_bloom` script. Inference differs from pre-training in several ways: for example, we need to load the pre-trained checkpoint and set the length of the output samples.
|
||||
|
||||
Configure the Bloom-176B inference script: examples/bloom/generate_bloom_176b_2nodes.sh
|
||||
|
||||
```shell
|
||||
# modify MASTER_ADDR to the IP address of the master node in the cluster.
# on the master node itself use localhost; on the other nodes use the master node's IP address, for example, 90.90.2.166
|
||||
MASTER_ADDR=localhost
|
||||
|
||||
# set the rank of this node: the master node has rank 0, and the other nodes are numbered in ascending order.
|
||||
NODE_RANK=0
|
||||
|
||||
# modify the model weight path and tokenizer path
|
||||
CHECKPOINT=/home/bloom_data/enwiki_100k/enwiki-100k_text_document
|
||||
VOCAB_FILE=/home/bloom_data/vocab_file/
|
||||
```
|
||||
|
||||
```shell
|
||||
bash ./examples/bloom/generate_bloom_176b_2nodes.sh
|
||||
```
|
||||
|
||||
## Example
|
||||
1. bloom 7b
|
||||

|
||||
2. bloom 176b
|
||||

|
||||
|
||||
All the provided scripts are tested on 910 64GB NPUs for BLOOM 7B and BLOOM 176B (fp16). These scripts might not work for other models or a different number of NPUs.
|
||||
|
||||
> Note: Sometimes NPU memory is not freed when an inference deployment crashes. You can free this memory by running `killall python` in the terminal.
|
||||
|
||||
## Citation
|
||||
|
||||
You may also consider citing the original work:
|
||||
|
||||
@article{scao2022bloom,
|
||||
title={Bloom: A 176b-parameter open-access multilingual language model},
|
||||
author={Scao, Teven Le and Fan, Angela and Akiki, Christopher and Pavlick, Ellie and Ili{\'c}, Suzana and Hesslow, Daniel and Castagn{\'e}, Roman and Luccioni, Alexandra Sasha and Yvon, Fran{\c{c}}ois and Gall{\'e}, Matthias and others},
|
||||
journal={arXiv preprint arXiv:2211.05100},
|
||||
year={2022}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
#!/bin/bash
|
||||
export TOKENIZERS_PARALLELISM=false
|
||||
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
|
||||
export HCCL_CONNECT_TIMEOUT=1200
|
||||
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
MASTER_ADDR=**.**.**.**
|
||||
MASTER_PORT=12890
|
||||
NNODES=2
|
||||
NPUS_PER_NODE=8
|
||||
NODE_RANK=1
|
||||
|
||||
VOCAB_FILE="your VOCAB FILE path"
|
||||
CHECKPOINT="your checkpoint path"
|
||||
|
||||
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $NODE_RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT"
|
||||
|
||||
# Real script
|
||||
python -m torch.distributed.run $DISTRIBUTED_ARGS ./tasks/inference/inference_gpt.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--load ${CHECKPOINT} \
|
||||
--tokenizer-type PretrainedFromHF \
|
||||
--tokenizer-name-or-path ${VOCAB_FILE} \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 2 \
|
||||
--embed-layernorm \
|
||||
--position-embedding-type alibi \
|
||||
--num-layers 70 \
|
||||
--hidden-size 14336 \
|
||||
--num-attention-heads 112 \
|
||||
--max-position-embeddings 2048 \
|
||||
--seq-length 2048 \
|
||||
--attention-dropout 0 \
|
||||
--hidden-dropout 0 \
|
||||
--micro-batch-size 1 \
|
||||
--init-method-std 0.0048 \
|
||||
--layernorm-epsilon 1e-6 \
|
||||
--fp16 \
|
||||
--no-load-optim \
|
||||
--no-load-rng
|
|
@ -0,0 +1,42 @@
|
|||
#!/bin/bash
|
||||
export TOKENIZERS_PARALLELISM=false
|
||||
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
|
||||
export HCCL_CONNECT_TIMEOUT=1200
|
||||
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
MASTER_ADDR=localhost
|
||||
MASTER_PORT=6000
|
||||
NNODES=1
|
||||
NODE_RANK=0
|
||||
NPUS_PER_NODE=8
|
||||
|
||||
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $NODE_RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT"
|
||||
|
||||
|
||||
VOCAB_FILE="your VOCAB FILE path"
|
||||
CHECKPOINT="your checkpoint path"
|
||||
|
||||
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/inference/inference_gpt.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 1 \
|
||||
--num-layers 30 \
|
||||
--hidden-size 4096 \
|
||||
--num-attention-heads 32 \
|
||||
--max-position-embeddings 2048 \
|
||||
--tokenizer-type PretrainedFromHF \
|
||||
--tokenizer-name-or-path "$VOCAB_FILE" \
|
||||
--tokenizer-not-use-fast \
|
||||
--fp16 \
|
||||
--micro-batch-size 1 \
|
||||
--seq-length 2048 \
|
||||
--max-new-tokens 64 \
|
||||
--seed 42 \
|
||||
--load "${CHECKPOINT}" \
|
||||
--embed-layernorm \
|
||||
--position-embedding-type alibi \
|
|
@ -33,7 +33,8 @@ cat <<EOT > $config_json
|
|||
EOT
|
||||
|
||||
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
|
||||
./tasks/inference/inference_bloom_pipeline \
|
||||
./tasks/inference/inference_bloom_pipeline.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 1 \
|
||||
--num-layers 30 \
|
||||
|
|
|
@ -34,6 +34,7 @@ EOT
|
|||
|
||||
deepspeed --num_nodes $NNODES --num_gpus $NPUS_PER_NODE \
|
||||
./tasks/inference/inference_llama_pipeline.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--num-layers 30 \
|
||||
--hidden-size 4096 \
|
||||
|
|
|
@ -17,6 +17,7 @@ CHECKPOINT="your megatron checkpoint path"
|
|||
VOCAB_FILE="your vocab path"
|
||||
|
||||
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/inference/inference_llama.py \
|
||||
--no-contiguous-buffers-in-local-ddp \
|
||||
--tensor-model-parallel-size 2 \
|
||||
--pipeline-model-parallel-size 2 \
|
||||
--num-layers 32 \
|
||||
|
|
Binary image files changed (not shown).
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
|
||||
|
||||
export TOKENIZERS_PARALLELISM=false
|
||||
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
|
||||
export HCCL_CONNECT_TIMEOUT=1200
|
||||
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
INPUT_PATH="your input checkpoint path"
|
||||
OUTPUT_PATH="your output checkpoint path"
|
||||
|
||||
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
|
||||
python $SCRIPT_PATH \
|
||||
--input-model-dir ${INPUT_PATH} \
|
||||
--output-model-dir ${OUTPUT_PATH} \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 1 \
|
||||
--type 7B
|
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
|
||||
|
||||
export TOKENIZERS_PARALLELISM=false
|
||||
export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
|
||||
export HCCL_CONNECT_TIMEOUT=1200
|
||||
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
INPUT_PATH="your input checkpoint path"
|
||||
OUTPUT_PATH="your output checkpoint path"
|
||||
|
||||
SCRIPT_PATH=./tools/ckpt_convert/bloom/convert_weights_from_gptmodelpipe_to_gptmodel_v2.py
|
||||
python $SCRIPT_PATH \
|
||||
--input-model-dir ${INPUT_PATH} \
|
||||
--output-model-dir ${OUTPUT_PATH} \
|
||||
--tensor-model-parallel-size 8 \
|
||||
--pipeline-model-parallel-size 2 \
|
||||
--type 176B
|
|
@ -0,0 +1,153 @@
|
|||
import argparse
import json
import math
import os
import sys

import torch

work_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))
print(f"work_path : {os.path.abspath(work_path)}")
sys.path.append(work_path)

from tools.ckpt_convert.llama.ckpt_utils import make_ascendspeed_model_dirs
from ascendspeed.error_utils import check_divisible_by_zero


def get_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input-model-dir", type=str, default="./input_model_dir", help="bloom huggingface model dir")
    parser.add_argument("--output-model-dir", type=str, default="./output_model_dir",
                        help="bloom ascendspeed model dir")
    parser.add_argument("--tensor-model-parallel-size", type=int, default=1, help="degree of tensor model parallel")
    parser.add_argument("--pipeline-model-parallel-size", type=int, default=1,
                        help="degree of pipeline model parallel")
    parser.add_argument("--type", type=str, choices=["7B", "176B"], default="7B")
    return parser.parse_args()


model_config = {
    "7B": [30, 4096, 32],  # num_layers, hidden_size, num_attention_heads
    "176B": [70, 14336, 112]
}


def extract_gptmodelpipe(input_model_dir):
    # Load every per-layer DeepSpeed checkpoint file from the (hard-coded) global_step1000 directory.
    files = os.listdir(os.path.join(input_model_dir, 'global_step1000'))
    input_models = {}
    for f in files:
        ckpt_file_path = os.path.join(input_model_dir, 'global_step1000', f)
        input_models[f] = torch.load(ckpt_file_path, map_location="cpu")
    print("load gpt model pipe finish")
    return input_models


def generate_gptmodel_weights(input_dir, output_dir, tp_size, pp_size, model_type):
    # GPTModelPipe layer numbering handled below: layer 1 holds the word embeddings,
    # layers 3 .. num_layers + 2 are the transformer blocks, layer num_layers + 4 is the
    # final layernorm; layers 2 and num_layers + 3 are skipped.
    layer_size = model_config.get(model_type)[0]
    gptmodelpipe = extract_gptmodelpipe(input_dir)

    # parameter files that are actually present on this machine
    param_file_set = [os.path.basename(file_path) for file_path in gptmodelpipe.keys()]
    param_file_set = sorted([file_name for file_name in param_file_set if file_name.startswith('layer_')])
    print(f"actual file set : {param_file_set}")

    release_model_dir = os.path.join(output_dir, "release")
    language_model = {}
    language_model['encoder'] = {}

    word_embeddings_for_head = {}

    for pp_rank in range(pp_size):
        layer_mean = math.ceil(check_divisible_by_zero(layer_size, pp_size))
        current_layer_pp_rank = list(range(pp_rank * layer_mean + 3, (pp_rank + 1) * layer_mean + 3))
        if pp_rank == 0:
            current_layer_pp_rank.append(1)
        if pp_rank == pp_size - 1:
            current_layer_pp_rank = list(range(pp_rank * layer_mean + 3, layer_size + 3))
            current_layer_pp_rank.append(1)
            current_layer_pp_rank.append(layer_size + 4)

        # parameter files that this pipeline rank should contain in theory
        theo_file_set = []
        for layer_num in current_layer_pp_rank:
            for tp_rank in range(tp_size):
                theo_file_set.append(f"layer_{layer_num:02d}-model_{tp_rank:02d}-model_states.pt")
        print(f"expected file set : {theo_file_set}")

        if len(set(param_file_set) & set(theo_file_set)) == len(set(param_file_set)):
            print(f"current rank : {pp_rank}, layers contained : {current_layer_pp_rank}")
        else:
            print(f"{current_layer_pp_rank} not on rank : {pp_rank}")
            continue

        for tp_rank in range(tp_size):
            for layer_num in current_layer_pp_rank:
                layer_name = f"layer_{layer_num:02d}-model_{tp_rank:02d}-model_states.pt"
                if layer_num == 1:
                    if pp_rank == 0:
                        language_model['embedding'] = {}
                        language_model['embedding']['word_embeddings'] = {}
                        language_model['embedding']['word_embeddings']['weight'] = gptmodelpipe.get(layer_name).get(
                            'word_embeddings.weight')

                    if pp_rank == pp_size - 1:
                        word_embeddings_for_head['weight'] = gptmodelpipe.get(layer_name).get(
                            'word_embeddings.weight')

                elif layer_num == 2:
                    continue
                elif layer_num == layer_size + 3:
                    continue
                elif layer_num == layer_size + 4:
                    language_model['encoder']["final_layernorm.weight"] = gptmodelpipe.get(layer_name).get(
                        "final_layernorm.weight")
                    language_model['encoder']["final_layernorm.bias"] = gptmodelpipe.get(layer_name).get(
                        "final_layernorm.bias")
                else:
                    encoder_layer_name = f"layers.{layer_num - 3 - pp_rank * layer_mean}."

                    language_model['encoder'][f"{encoder_layer_name}input_layernorm.weight"] = \
                        gptmodelpipe.get(layer_name).get('input_layernorm.weight')
                    language_model['encoder'][f"{encoder_layer_name}input_layernorm.bias"] = \
                        gptmodelpipe.get(layer_name).get('input_layernorm.bias')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.query_key_value.weight"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.query_key_value.weight')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.query_key_value.bias"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.query_key_value.bias')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.dense.weight"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.dense.weight')
                    language_model['encoder'][f"{encoder_layer_name}self_attention.dense.bias"] = \
                        gptmodelpipe.get(layer_name).get('self_attention.dense.bias')
                    language_model['encoder'][f"{encoder_layer_name}post_attention_layernorm.weight"] = \
                        gptmodelpipe.get(layer_name).get('post_attention_layernorm.weight')
                    language_model['encoder'][f"{encoder_layer_name}post_attention_layernorm.bias"] = \
                        gptmodelpipe.get(layer_name).get('post_attention_layernorm.bias')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_h_to_4h.weight"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_h_to_4h.weight')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_h_to_4h.bias"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_h_to_4h.bias')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_4h_to_h.weight"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_4h_to_h.weight')
                    language_model['encoder'][f"{encoder_layer_name}mlp.dense_4h_to_h.bias"] = \
                        gptmodelpipe.get(layer_name).get('mlp.dense_4h_to_h.bias')
            model_dic = {'checkpoint_version': 3.0, 'model': {}}
            model_dic['model']['language_model'] = language_model
            if pp_rank == pp_size - 1:
                model_dic['model']['word_embeddings_for_head'] = word_embeddings_for_head

            print(model_dic['model'].keys())
            model_dir = os.path.join(release_model_dir, f"{'mp_rank_{:02d}_{:03d}'.format(tp_rank, pp_rank)}")
            os.makedirs(model_dir, exist_ok=True)
            torch.save(model_dic, os.path.join(model_dir, "model_optim_rng.pt"))
            print(f'{os.path.join(model_dir, "model_optim_rng.pt")} saved!')
        # the matching pipeline rank has been found and saved, so stop here
        break
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>change over>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")


if __name__ == '__main__':
    args = get_args()
    make_ascendspeed_model_dirs(args.output_model_dir)
    generate_gptmodel_weights(args.input_model_dir, args.output_model_dir, args.tensor_model_parallel_size,
                              args.pipeline_model_parallel_size, args.type)