parent 22c184321b
commit 979a633740

README.md (12 changes)
@@ -114,7 +114,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td>7B1</td>
 <td> 1x8</td>
 <td> FP16 </td>
-<td> 2611 </td>
+<td> 2034 </td>
 <td> 2525 </td>
 <td> <a href="examples/bloom/pretrain_bloom_7b1.sh">训练</a> </td>
 </tr>
@@ -122,7 +122,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td >176B</td>
 <td >12x8</td>
 <td> BF16 </td>
-<td> 112 </td>
+<td> 100 </td>
 <td> 107 </td>
 <td> <a href="examples/bloom/pretrain_bloom_176b.sh">训练</a> </td>
 </tr>
@@ -232,8 +232,8 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td>72B</td>
 <td>16x8</td>
 <td>BF16 </td>
-<td> -- </td>
-<td> -- </td>
+<td> 285 </td>
+<td> 345 </td>
 <td> <a href="examples/qwen/pretrain_qwen_72b_ptd.sh">训练</a> </td>
 </tr>
 <tr>
@@ -417,8 +417,8 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <tr>
 <td>72B</td>
 <td> -- </td>
-<td> -- </td>
-<td> -- </td>
+<td> <a href="https://gitee.com/ascend/ModelLink/blob/modellink/tasks/inference/generate_qwen_72b_ptd.sh">对话 </a> </td>
+<td> <a href="https://gitee.com/ascend/ModelLink/blob/modellink/tasks/evaluation/evaluate_qwen_72b_ptd.sh">评估 </a> </td>
 <td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
 </tr>
 <tr>

README_en.md (12 changes)
@@ -112,7 +112,7 @@ Currently, the following downstream tasks have been supported:
 <td>7B1</td>
 <td> 1x8</td>
 <td> FP16 </td>
-<td> 2611 </td>
+<td> 2034 </td>
 <td> 2525 </td>
 <td> <a href="examples/bloom/pretrain_bloom_7b1.sh">Train</a> </td>
 </tr>
@@ -120,7 +120,7 @@ Currently, the following downstream tasks have been supported:
 <td >176B</td>
 <td >12x8</td>
 <td> BF16 </td>
-<td> 112 </td>
+<td> 100 </td>
 <td> 107 </td>
 <td> <a href="examples/bloom/pretrain_bloom_176b.sh">Train</a> </td>
 </tr>
@@ -230,8 +230,8 @@ Currently, the following downstream tasks have been supported:
 <td>72B</td>
 <td>16x8</td>
 <td>BF16 </td>
-<td> -- </td>
-<td> -- </td>
+<td> 285 </td>
+<td> 345 </td>
 <td> <a href="examples/qwen/pretrain_qwen_72b_ptd.sh">Train</a> </td>
 </tr>
 <tr>
@@ -414,8 +414,8 @@ Currently, the following downstream tasks have been supported:
 <tr>
 <td>72B</td>
 <td> -- </td>
-<td> -- </td>
-<td> -- </td>
+<td> <a href="https://gitee.com/ascend/ModelLink/blob/modellink/tasks/inference/generate_qwen_72b_ptd.sh">inference </a> </td>
+<td> <a href="https://gitee.com/ascend/ModelLink/blob/modellink/tasks/evaluation/evaluate_qwen_72b_ptd.sh">evaluation </a> </td>
 <td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
 </tr>
 <tr>

@@ -28,6 +28,8 @@
 - [脚本](#脚本)
 - [性能](#性能)
 - [吞吐](#吞吐)
+- [推理](#推理)
+- [评估](#评估)

 # Qwen-7B

@@ -140,9 +142,9 @@ Qwen-7B 训练的硬件配置:

 cd ..
 ```
-5. 微调
+5. 预训练

-配置Qwen-7B 微调脚本: examples/qwen/pretrain_qwen_7b_ptd.sh
+配置Qwen-7B 预训练脚本: examples/qwen/pretrain_qwen_7b_ptd.sh

 ```shell
 # 设置 ascend-toolkit 路径
@@ -155,7 +157,7 @@ Qwen-7B 训练的硬件配置:
 CKPT_LOAD_DIR="your megatron ckpt save path"
 ```

-启动 Qwen-7B 微调脚本: examples/qwen/pretrain_qwen_7b_ptd.sh
+启动 Qwen-7B 预训练脚本: examples/qwen/pretrain_qwen_7b_ptd.sh

 ```shell
 bash examples/qwen/pretrain_qwen_7b_ptd.sh
@@ -222,10 +224,10 @@ TASK="mmlu" # ceval任务配置为 "ceval"
 bash tasks/evaluation/evaluate_qwen_7b_ptd.sh
 ```

 | 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
-|:---:|:---:|:---:|:---:|:---:|
-| CEval | 52 | 1346 | 63.5 | 62.5 |
-| MMLU | 57 | 14042 | 58.2 | 58.1 |
+|:---:|:---:|:---:|:-----------------------------------------------------------------------------:|:---:|
+| CEval | 52 | 1346 | [63.5](https://huggingface.co/Qwen/Qwen-7B) | 62.5 |
+| MMLU | 57 | 14042 | [58.2](https://huggingface.co/Qwen/Qwen-7B) | 58.1 |

 # Qwen-14B

@@ -345,9 +347,9 @@ Qwen-14B 训练的硬件配置:

 cd ..
 ```
-5. 微调
+5. 预训练

-配置Qwen-14B 微调脚本: examples/qwen/pretrain_qwen_14b_ptd.sh
+配置Qwen-14B 预训练脚本: examples/qwen/pretrain_qwen_14b_ptd.sh

 ```shell
 # 设置 ascend-toolkit 路径
@@ -360,7 +362,7 @@ Qwen-14B 训练的硬件配置:
 CKPT_LOAD_DIR="your megatron ckpt save path"
 ```

-启动 Qwen-14B 微调脚本: examples/qwen/pretrain_qwen_14b_ptd.sh
+启动 Qwen-14B 预训练脚本: examples/qwen/pretrain_qwen_14b_ptd.sh

 ```shell
 bash examples/qwen/pretrain_qwen_14b_ptd.sh
@@ -424,10 +426,10 @@ TASK="mmlu" # ceval任务配置为 "ceval"
 bash tasks/evaluation/evaluate_qwen_14b_ptd.sh
 ```

 | 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
-|:---:|:---:|:---:|:---:|:---:|
-| CEval | 52 | 1346 | 72.1 | 71.1 |
-| MMLU | 57 | 14042 | 66.3 | 65.3 |
+|:---:|:---:|:---:|:--------------------------------------------:|:---:|
+| CEval | 52 | 1346 | [72.1](https://huggingface.co/Qwen/Qwen-14B) | 71.1 |
+| MMLU | 57 | 14042 | [66.3](https://huggingface.co/Qwen/Qwen-14B) | 65.3 |


 # Qwen-72B
@@ -436,9 +438,10 @@ bash tasks/evaluation/evaluate_qwen_14b_ptd.sh

 Qwen-72B 训练的硬件配置:

-| 硬件 | 配置 |
-| :--: |:-----------------:|
-| NPU | 128 x Ascend NPUs |
+| 硬件 | 序列长度 | 配置 |
+|:---:|:----:|:-----------------:|
+| NPU | 8k | 64 x Ascend NPUs |
+| NPU | 32k | 320 x Ascend NPUs |

 ### 脚本

@@ -522,15 +525,15 @@ Qwen-72B 训练的硬件配置:
 --tokenizer-name-or-path ../qwen-72b-hf \
 --output-prefix ../dataset_qwen-72b/alpaca \
 --tokenizer-type PretrainedFromHF \
---seq-length 32768 \
+--seq-length 8192 \
 --workers 4 \
 --log-interval 1000 \

 cd ..
 ```
-5. 微调
+5. 预训练

-配置Qwen-72B 微调脚本: examples/qwen/pretrain_qwen_72b_ptd.sh
+配置Qwen-72B 预训练脚本: examples/qwen/pretrain_qwen_72b_ptd.sh

 ```shell
 # 设置 ascend-toolkit 路径
@@ -542,8 +545,18 @@ Qwen-72B 训练的硬件配置:
 DATA_PATH="./dataset_qwen-72b/alpaca_text_document" #数据集路径
 CKPT_LOAD_DIR="your megatron ckpt save path"
 ```

-启动 Qwen-72B 微调脚本: examples/qwen/pretrain_qwen_72b_ptd.sh
+若使用32k长序列，需要开启重计算特性并修改seq-length参数值为32768，参数配置如下：
+
+```shell
+--seq-length 32768 \
+
+--recompute-granularity full \
+--recompute-method block \
+--recompute-num-layers 2 \
+```
+
+启动 Qwen-72B 预训练脚本: examples/qwen/pretrain_qwen_72b_ptd.sh

 ```shell
 bash examples/qwen/pretrain_qwen_72b_ptd.sh
@@ -558,4 +571,57 @@ Qwen-72B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:
 | 设备 | 模型 | tokens吞吐 (tokens/s/p)(8k序列) | tokens吞吐 (tokens/s/p)(32k序列) |
 |:----:|:--------:|:-----------------------:|:-----------------------:|
 | NPUs | Qwen-72B | 285 | -- |
 | 参考 | Qwen-72B | 345 | -- |
+
+
+## 推理
+
+配置 qwen-72b 推理脚本：tasks/inference/generate_qwen_72b_ptd.sh
+
+```bash
+# ascend-toolkit 路径
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+# 修改模型权重路径和此表路径
+CHECKPOINT="your model directory path"
+TOKENIZER_PATH=./qwen-72b-hf
+```
+
+启动qwen-72b推理脚本
+```bash
+bash tasks/inference/generate_qwen_72b_ptd.sh
+```
+
+推理示例如下:
+
+
+
+## 评估
+
+使用[CEval数据集](https://huggingface.co/datasets/ceval/ceval-exam)和[MMLU数据集](https://huggingface.co/datasets/cais/mmlu)评估模型.
+
+配置qwen-72b评估脚本: tasks/evaluation/evaluate_qwen_72b_ptd.sh
+
+```bash
+# ascend-toolkit 路径
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+# 修改模型参数路径和词表路径
+TOKENIZER_PATH=./qwen-72b-hf #词表路径
+CHECKPOINT="your model directory path" #模型路径
+
+# 配置任务和数据集路径
+DATA_PATH="./mmlu/data/test/" # ceval任务配置为 "./ceval/val/"
+TASK="mmlu" # ceval任务配置为 "ceval"
+```
+
+启动评估
+
+```bash
+bash tasks/evaluation/evaluate_qwen_72b_ptd.sh
+```
+
+| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
+|:---:|:---:|:---:|:--------------------------------------------:|:---:|
+| CEval | 52 | 1346 | [83.3](https://huggingface.co/Qwen/Qwen-72B) | 81.8 |
+| MMLU | 57 | 14042 | [77.4](https://huggingface.co/Qwen/Qwen-72B) | 74.6 |

@@ -27,6 +27,8 @@
 - [Script](#script)
 - [Performance](#performance)
 - [Machine performance](#machine-performance)
+- [Inference](#Inference)
+- [Evaluation](#Evaluation)

 # Qwen-7B

@@ -140,9 +142,9 @@ Here's a hardware summary of pre-training Qwen-7B:
 cd ..
 ```

-5. fine-tuning
+5. pre-training

-Config Qwen-7B fine-tuning script: examples/qwen/pretrain_qwen_7b_ptd.sh
+Config Qwen-7B pre-training script: examples/qwen/pretrain_qwen_7b_ptd.sh
 ```shell
 # modify the script according to your own ascend-toolkit path
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -154,7 +156,7 @@ Here's a hardware summary of pre-training Qwen-7B:
 CKPT_LOAD_DIR="your megatron ckpt save path"
 ```

-Launch Qwen-7B fine-tuning script: examples/qwen/pretrain_qwen_7b_ptd.sh
+Launch Qwen-7B pre-training script: examples/qwen/pretrain_qwen_7b_ptd.sh

 ```shell
 bash examples/qwen/pretrain_qwen_7b_ptd.sh
@@ -342,9 +344,9 @@ Here's a hardware summary of pre-training Qwen-14B:
 cd ..
 ```

-5. fine-tuning
+5. pre-training

-Config Qwen-14B fine-tuning script: examples/qwen/pretrain_qwen_14b_ptd.sh
+Config Qwen-14B pre-training script: examples/qwen/pretrain_qwen_14b_ptd.sh
 ```shell
 # modify the script according to your own ascend-toolkit path
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -356,7 +358,7 @@ Here's a hardware summary of pre-training Qwen-14B:
 CKPT_LOAD_DIR="your megatron ckpt save path"
 ```

-Launch Qwen-14B fine-tuning script: examples/qwen/pretrain_qwen_14b_ptd.sh
+Launch Qwen-14B pre-training script: examples/qwen/pretrain_qwen_14b_ptd.sh

 ```shell
 bash examples/qwen/pretrain_qwen_14b_ptd.sh
@@ -430,9 +432,10 @@ bash ./tasks/evaluation/evaluate_qwen_14b_ptd.sh

 Here's a hardware summary of pre-training Qwen-72B:

-| Hardware | Value |
-| :------: |:-----------------:|
-| NPU | 128 x Ascend NPUs |
+| Hardware | Seq-length | Value |
+| :------: |:----------:|:-----------------:|
+| NPU | 8k | 64 x Ascend NPUs |
+| NPU | 32k | 320 x Ascend NPUs |

 ### Script

@@ -517,16 +520,16 @@ Here's a hardware summary of pre-training Qwen-72B:
 --tokenizer-name-or-path ../qwen-72b-hf \
 --output-prefix ../dataset_qwen-72b/alpaca \
 --tokenizer-type PretrainedFromHF \
---seq-length 32768 \
+--seq-length 8192 \
 --workers 4 \
 --log-interval 1000 \

 cd ..
 ```

-5. fine-tuning
+5. pre-training

-Config Qwen-72B fine-tuning script: examples/qwen/pretrain_qwen_72b_ptd.sh
+Config Qwen-72B pre-training script: examples/qwen/pretrain_qwen_72b_ptd.sh
 ```shell
 # modify the script according to your own ascend-toolkit path
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -537,8 +540,17 @@ Here's a hardware summary of pre-training Qwen-72B:
 DATA_PATH="./dataset_qwen-72b/alpaca_text_document" #processed dataset
 CKPT_LOAD_DIR="your megatron ckpt save path"
 ```

-Launch Qwen-72B fine-tuning script: examples/qwen/pretrain_qwen_72b_ptd.sh
+To use a 32K sequence, turn on the re-computation feature and change the value of seq-length to 32768. The parameter configuration is as follows:
+```shell
+--seq-length 32768 \
+
+--recompute-granularity full \
+--recompute-method block \
+--recompute-num-layers 2 \
+```
+
+Launch Qwen-72B pre-training script: examples/qwen/pretrain_qwen_72b_ptd.sh

 ```shell
 bash examples/qwen/pretrain_qwen_72b_ptd.sh
@@ -554,3 +566,54 @@ The performance of Qwen-72B in **Ascend NPU** and **Reference**:
 |:---------:|:-------:|:--------------------------------:|:---------------------------------:|
 | NPUs | Qwen-7B | 285 | -- |
 | Reference | Qwen-7B | 345 | -- |
+
+
+## Inference
+Config qwen-72b inference script: tasks/inference/generate_qwen_72b_ptd.sh
+
+```bash
+# ascend-toolkit path
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+# modify script model path and tokenizer path
+CHECKPOINT="your model directory path"
+TOKENIZER_PATH=./qwen-72b-hf
+```
+
+Launch qwen-72b inference script: tasks/inference/generate_qwen_72b_ptd.sh
+```bash
+bash tasks/inference/generate_qwen_72b_ptd.sh
+```
+
+Some inference samples are as follows:
+
+
+
+## Evaluation
+We use the [CEval benchmark](https://huggingface.co/datasets/ceval/ceval-exam) and [MMLU benchmark](https://huggingface.co/datasets/cais/mmlu) to evaluate our model.
+
+Config qwen-72b evaluation script: tasks/evaluation/evaluate_qwen_72b_ptd.sh
+
+```bash
+# ascend-toolkit path
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+# Modify the model parameter path and vocabulary path
+TOKENIZER_PATH=./qwen-72b-hf # vocabulary path
+CHECKPOINT="your model directory path" # parameter path
+
+# Configure the task type and dataset path
+DATA_PATH="./mmlu/data/test/" # "./ceval/val/" for ceval task
+TASK="mmlu" # "ceval" for ceval task
+```
+
+Launch qwen-72b evaluation
+
+```bash
+bash ./tasks/evaluation/evaluate_qwen_72b_ptd.sh
+```
+
+| Task | Subset | Question | OpenSource | NPU |
+|:---:|:---:|:---:|:---:|:---:|
+| CEval | 52 | 1346 | 83.3 | 81.8 |
+| MMLU | 57 | 14042 | 77.4 | 74.6 |

@@ -43,7 +43,7 @@ GPT_ARGS="
 --global-batch-size 64 \
 --make-vocab-size-divisible-by 32 \
 --lr 1.25e-6 \
---train-iters 2000 \
+--train-iters 1000 \
 --lr-decay-style cosine \
 --untie-embeddings-and-output-weights \
 --disable-bias-linear \

@@ -1,11 +1,12 @@
 #!/bin/bash

 export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NPU_ASD_ENABLE=0

 NPUS_PER_NODE=8
 MASTER_ADDR=localhost
 MASTER_PORT=6000
-NNODES=1
+NNODES=8
 NODE_RANK=0
 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

@@ -15,7 +16,7 @@ TOKENIZER_MODEL="your tokenizer path"
 CKPT_LOAD_DIR="your model load ckpt path"

 TP=8
-PP=1
+PP=8

 DISTRIBUTED_ARGS="
 --nproc_per_node $NPUS_PER_NODE \
@@ -36,7 +37,7 @@ GPT_ARGS="
 --tokenizer-type PretrainedFromHF \
 --load ${CKPT_LOAD_DIR} \
 --tokenizer-name-or-path ${TOKENIZER_MODEL} \
---seq-length 32768 \
+--seq-length 8192 \
 --max-position-embeddings 32768 \
 --micro-batch-size 1 \
 --global-batch-size 16 \
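
The three hunks above appear to belong to the Qwen-72B pretraining launch script (the values match the "8k | 64 x Ascend NPUs" hardware row added to the README). As a quick, illustrative cross-check of the new distributed settings — not part of the commit, and assuming the usual Megatron decomposition world size = TP × PP × DP — the arithmetic works out to the 64-NPU figure:

```python
# Illustrative sanity check only; the values are the ones set in the hunks above.
npus_per_node = 8
nnodes = 8            # NNODES raised from 1 to 8
tp, pp = 8, 8         # TP=8 unchanged, PP raised from 1 to 8

world_size = npus_per_node * nnodes       # 64 NPUs total (8 nodes x 8 NPUs per node)
assert world_size % (tp * pp) == 0
dp = world_size // (tp * pp)              # remaining data-parallel degree: 64 / 64 = 1
print(f"world={world_size} tp={tp} pp={pp} dp={dp}")
```

Read this way, the change moves the 72B recipe from a single 8-NPU node to an 8-node, 64-NPU job with 8-way pipeline parallelism, and the default sequence length drops to 8192 in line with the 8k row of the hardware table.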

@@ -43,7 +43,7 @@ GPT_ARGS="
 --global-batch-size 64 \
 --make-vocab-size-divisible-by 16 \
 --lr 1.25e-6 \
---train-iters 2000 \
+--train-iters 1000 \
 --lr-decay-style cosine \
 --untie-embeddings-and-output-weights \
 --disable-bias-linear \
|
@ -14,4 +14,5 @@ six
|
||||||
torch==2.1.0
|
torch==2.1.0
|
||||||
torchvision==0.16.0
|
torchvision==0.16.0
|
||||||
protobuf
|
protobuf
|
||||||
peft==0.7.1
|
peft==0.7.1
|
||||||
|
tiktoken
|
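
Assuming the hunk above edits the repository's Python requirements file, the new tiktoken entry is most likely there because Qwen's Hugging Face tokenizer imports tiktoken when it is loaded. A minimal, illustrative check that the dependency resolves after installation:

```python
# Illustrative only: confirm tiktoken is importable and functional after
# installing the updated requirements (e.g. `pip install -r requirements.txt`,
# assuming that is the file touched above).
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")   # any bundled encoding exercises the package
print(len(enc.encode("hello world")))        # prints 2 for this encoding
```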

(New binary image file added, 57 KiB — content not shown.)

@@ -68,7 +68,7 @@ class MmluEval(DatasetEval):
 chat_results, rank = chat.chat(instruction=instructions, history=[])
 if chat_results:
     for index, chat_result in enumerate(chat_results):
-        answer = chat_result[0]
+        answer = chat_result[0].lstrip()
         try:
             if rank == 0:
                 logger.info(instruction)
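
A short aside on why the `.lstrip()` matters (illustrative reasoning, not part of the commit): the evaluation launch script added below runs with `--max-new-tokens 1`, so the model's answer is a single generated token, and a single token often decodes with a leading space. Stripping it lets the choice letter compare equal to the expected answer key:

```python
# Illustrative only: hypothetical values showing the failure mode the change avoids.
raw_answer = " A"                   # a one-token completion frequently decodes like this
print(raw_answer == "A")            # False - the leading space breaks the exact match
print(raw_answer.lstrip() == "A")   # True once the whitespace is stripped
```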

@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# The number of parameters is not aligned
+export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
+export HCCL_CONNECT_TIMEOUT=1200
+export COMBINED_ENABLE=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=8
+
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+CHECKPOINT="your model directory path"
+TOKENIZER_PATH="your tokenizer directory path"
+DATA_PATH="./mmlu/data/test"
+TASK="mmlu"
+
+# Different task needs different max_new_tokens value, please follow the instruction in readme.
+torchrun $DISTRIBUTED_ARGS ./tasks/evaluation/evaluation_llama.py \
+    --task-data-path $DATA_PATH \
+    --task $TASK\
+    --seq-length 8192 \
+    --max-new-tokens 1 \
+    --max-position-embeddings 32768 \
+    --tensor-model-parallel-size 8 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 80 \
+    --hidden-size 8192 \
+    --ffn-hidden-size 24576 \
+    --num-attention-heads 64 \
+    --disable-bias-linear \
+    --swiglu \
+    --position-embedding-type rope \
+    --load ${CHECKPOINT} \
+    --normalization RMSNorm \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --tokenizer-not-use-fast \
+    --bf16 \
+    --micro-batch-size 1 \
+    --exit-on-missing-checkpoint \
+    --no-load-rng \
+    --no-load-optim \
+    --untie-embeddings-and-output-weights \
+    --add-qkv-bias \
+    --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
+    --make-vocab-size-divisible-by 64 \
+    --seed 42 | tee ./eval_qwen_72b_${TASK}.log

@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# The number of parameters is not aligned
+export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
+export HCCL_CONNECT_TIMEOUT=1200
+export COMBINED_ENABLE=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# please fill these path configurations
+CHECKPOINT="your model directory path"
+TOKENIZER_PATH="your tokenizer path"
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=8
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+torchrun $DISTRIBUTED_ARGS inference_qwen.py \
+    --tensor-model-parallel-size 8 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 80 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --ffn-hidden-size 24576 \
+    --max-position-embeddings 32768 \
+    --seq-length 8192 \
+    --make-vocab-size-divisible-by 64 \
+    --untie-embeddings-and-output-weights \
+    --micro-batch-size 1 \
+    --swiglu \
+    --disable-bias-linear \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --load ${CHECKPOINT} \
+    --normalization RMSNorm \
+    --position-embedding-type rope \
+    --norm-epsilon 1e-6 \
+    --hidden-dropout 0 \
+    --attention-dropout 0 \
+    --tokenizer-not-use-fast \
+    --add-qkv-bias \
+    --rotary-base 1000000 \
+    --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
+    --max-new-tokens 256 \
+    --seed 42 \
+    --bf16