Showing 12 changed files with 690 additions and 0 deletions.
@@ -1,2 +1,3 @@
# CuteGPT

An open-source conversational language model developed by the Knowledge Works Research Laboratory at Fudan University.
@@ -0,0 +1,77 @@
# CuteGPT

CuteGPT is an open-source bilingual (Chinese/English) conversational language model released by the [Knowledge Works Research Laboratory at Fudan University](http://kw.fudan.edu.cn/). It is an improved and extended version of the original Llama with 13B (13 billion) parameters and supports int8 inference on a single RTX 3090 GPU. On top of the original Llama, CuteGPT expands the Chinese vocabulary and performs further pre-training to strengthen its understanding of Chinese, followed by instruction fine-tuning on dialogue data to improve its instruction-following ability.

## Local Deployment

### Install Dependencies

```bash
conda create -n cutegpt python=3.7
conda activate cutegpt
pip install -r requirements.txt
```

### Usage Example

```bash

```
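
A minimal int8 inference sketch with Hugging Face `transformers` is shown below for orientation; the checkpoint path and the prompt are placeholders, not taken from this repository:

```python
# Hypothetical inference sketch -- the checkpoint path is a placeholder, not the
# released CuteGPT weights. int8 loading requires the bitsandbytes package.
from transformers import LlamaForCausalLM, LlamaTokenizer

model_path = "path/to/cutegpt-13b"  # assumption: local or hub path to the 13B checkpoint
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,   # int8 weights, intended to fit on a single 24 GB RTX 3090
    device_map="auto",
)

prompt = "问:介绍一下复旦大学。\n答:"  # illustrative prompt format only
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```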
## Fine-tuning

### Data Format

The data is stored as a `.pkl` file and can be inspected with the following code:

```python
import pickle
import random

file = 'data/test/ift_data.pkl'
with open(file, "rb") as f:
    datas = pickle.load(f)
print(len(datas))
print(random.sample(datas, 3))
```

The data is a two-dimensional list: `len(datas)` is the number of samples, and `len(datas[i])` is the number of dialogue turns in the `i`-th sample. Each turn is a dict with three keys: `instruction`, `input`, and `output`. For example:

```python
[[{'instruction': '请帮我生成一首诗。', 'input': '', 'output': '好的,请给我一些关键词来启发诗歌的创作。'},
  {'instruction': '月亮、海浪、夜晚。',
   'input': '',
   'output': '心中的孤独由天空掩盖,\n夜幕笼罩下,海浪轻扬,\n月光照亮着沙滩,\n让我不再孤单。'}],
 [{'instruction': '根据给定的数据集,训练一个能够对垃圾邮件进行分类的模型。\n提供垃圾邮件和非垃圾邮件两类的数据集。',
   'input': '',
   'output': '要训练一个能...'}]]
```
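
Conversely, a dataset in this format can be assembled and written back to `.pkl` along the following lines (file path and sample texts are only examples):

```python
import pickle

# Two samples: a two-turn dialogue and a single-turn instruction.
datas = [
    [
        {'instruction': '请帮我生成一首诗。', 'input': '', 'output': '好的,请给我一些关键词来启发诗歌的创作。'},
        {'instruction': '月亮、海浪、夜晚。', 'input': '', 'output': '心中的孤独由天空掩盖……'},
    ],
    [
        {'instruction': '解释什么是机器学习。', 'input': '', 'output': '机器学习是……'},
    ],
]

with open('data/test/ift_data.pkl', 'wb') as f:
    pickle.dump(datas, f)
```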
### Fine-tuning Code

Preprocess the data by concatenating it into the multi-turn dialogue format:

```bash
python code/convert_data.py \
    --tokenizer ziqingyang/chinese-llama-lora-7b \
    --max_length 2048 \
    --out_data_path data/test/
```
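
The script writes `llama_ift_data_ids.pkl` into `out_data_path`; a quick sanity check on the result could look like this (paths follow the command above):

```python
import pickle

with open('data/test/llama_ift_data_ids.pkl', 'rb') as f:
    packed = pickle.load(f)

# The pickle holds flat, pre-tokenized streams that the training dataset later
# slices into max_length-sized chunks.
print(packed.keys())             # dict_keys(['input_ids', 'labels', 'attention_mask'])
print(len(packed['input_ids']))  # total number of packed tokens
assert len(packed['input_ids']) == len(packed['labels']) == len(packed['attention_mask'])
```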
Train the model:

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed lora_llama_flashattn.py \
    --master_port 12932 \
    --save_steps 2000 \
    --max_epoches 5 \
    --save_name llama_lora_623v1 \
    --model_path /data/xuyipei/my_llama/my_llama_13b/llama_13b_112/ \
    --dataset_type DatasetIds_HQY \
    --data_path ../weighted_dataset/623v1/llama_ift_data_ids.pkl \
    --max_length 2048 \
    --use_flash_attention
```
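
With the DeepSpeed settings that appear later in this commit (`train_micro_batch_size_per_gpu = 3`, `gradient_accumulation_steps = 8`) and the eight GPUs in the command above, the effective global batch size works out as follows:

```python
# Effective global batch size implied by DS_CONFIG and the launch command above.
micro_batch_per_gpu = 3   # train_micro_batch_size_per_gpu
grad_accum_steps = 8      # gradient_accumulation_steps
num_gpus = 8              # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

effective_batch = micro_batch_per_gpu * grad_accum_steps * num_gpus
print(effective_batch)    # 192 sequences of up to 2048 tokens per optimizer step
```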
Parameter description:

- `dataset_type`: the dataset class used to load the training data (the command above passes `DatasetIds_HQY`).
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,98 @@
from peft import LoraConfig


# DeepSpeed configuration for fine-tuning: bf16 training, AdamW, warmup LR
# schedule and ZeRO stage-1 optimizer state partitioning.
DS_CONFIG = {
    "bf16": {
        "enabled": True,
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 3e-5,
            "betas": [0.98, 0.999],
            "eps": 1e-9
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 1e-4,
            "warmup_max_lr": 3e-4,
            "warmup_num_steps": 300
        }
    },
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": True,
        # "offload_optimizer": {
        #     "device": "cpu"
        # },
        "stage3_gather_16bit_weights_on_model_save": True
    },
    "gradient_accumulation_steps": 8,
    "train_micro_batch_size_per_gpu": 3,
    "wall_clock_breakdown": False
}

# LoRA hyperparameters: rank-8 adapters attached to the query and value
# projections of every attention layer.
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)


# Templates for wrapping the optional `input` field of a sample.
input_template_pool = [
    "Input:{}",
    "Question:{}",
    "Problem:{}",
    "Q:{}",
    "*Question*:{}",
    "(Problem){}",
    "{}",
    "{}",
    "{}"
]

# Prompt templates keyed by whether a sample is multi-round ("wround" / "woround")
# and whether it carries an input field ("winput" / "woinput").
template_pool = {
    'wround_woinput': [
        "问:{}\n答:{}\n",
        "Instruction:{}\nResponse:{}\n",
        "{}\n{}\n"
    ],
    'wround_winput': [
        "背景:{}\n{}\n答:{}\n",
        "已知:{}\n{}\n回答:{}\n",
        "问:{}\n{}\n答:{}\n",
        "Instruction:{}\n{}\nResponse:{}\n",
        "{}\n{}\n{}\n"
    ],
    'woround_woinput': [
        "问:{}\n答:{}\n",
        "Instruction:{}\nResponse:{}\n",
        "{}\n{}\n"
    ],
    'woround_winput': [
        "问:{}\n{}\n答:{}\n",
        "Instruction:{}\n{}\nResponse:{}\n",
        "{}\n{}\n{}\n"
    ]
}
meta_prompt = '你是复旦大学知识工场实验室训练出来的语言模型CuteGPT。给定任务描述,请给出对应请求的回答。\n'
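
For orientation, `lora_config` and `DS_CONFIG` would typically be combined roughly as in the sketch below; this is a generic `peft` + `deepspeed` pattern with placeholder paths, not the actual `lora_llama_flashattn.py` training script:

```python
# Sketch: wrap a Llama checkpoint with the LoRA adapters defined above and hand the
# result to DeepSpeed using DS_CONFIG. Assumes lora_config / DS_CONFIG from the
# config module above; the model path is a placeholder.
import deepspeed
from peft import get_peft_model
from transformers import LlamaForCausalLM

model = LlamaForCausalLM.from_pretrained("/path/to/llama-13b")
model = get_peft_model(model, lora_config)  # only q_proj / v_proj LoRA weights stay trainable
model.print_trainable_parameters()

engine, optimizer, _, scheduler = deepspeed.initialize(
    model=model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
    config=DS_CONFIG,
)
```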
@@ -0,0 +1,30 @@
from transformers import AutoModelWithLMHead, T5Tokenizer, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
import transformers
import pickle
import random
import copy
from tqdm import tqdm
from dataset import GPT2Dataset_onlyres
import argparse
from utils import get_multiround_data

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer", type=str, default="ziqingyang/chinese-llama-lora-7b")
    parser.add_argument("--max_length", type=int, default=1024, help="max token length")
    parser.add_argument("--out_data_path", type=str, default='data/test/', help="the folder to load raw data and save preprocessed data")
    args = parser.parse_args()

    # Tokenize the raw multi-round dialogue data and dump the packed token ids.
    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer)
    datas = get_multiround_data(args.out_data_path + 'ift_data.pkl')
    train_dataset = GPT2Dataset_onlyres(tokenizer, datas, args.max_length)

    pickle.dump(
        {
            "input_ids": train_dataset.input_ids,
            "labels": train_dataset.labels,
            "attention_mask": train_dataset.attention_mask
        },
        open(args.out_data_path + "llama_ift_data_ids.pkl", "wb")
    )
@@ -0,0 +1,150 @@
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import copy


class GPT2Dataset_onlyres(Dataset):
    '''
    Dataset construction for training a GPT-2 model, without padding. Samples are
    concatenated and separated by the end-of-sequence (EOS) token, and only the loss
    on the response is computed.
    '''
    def __init__(self, tokenizer, datas, max_length):
        super().__init__()
        self.datas = datas
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.index = 0

        if not self.tokenizer.bos_token:
            self.tokenizer.bos_token = "<s>"
        if not self.tokenizer.eos_token:
            self.tokenizer.eos_token = "</s>"
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self._preprocess()

    def _preprocess(self):
        self.input_ids = []
        self.labels = []

        # The first element of the first sample holds the meta prompt shared by all samples.
        meta_prompt = self.datas[0][0]
        meta_tokens = self.tokenizer(meta_prompt, padding=False, truncation=False, add_special_tokens=False)
        meta_tokens = meta_tokens["input_ids"][-self.max_length//3:]

        for data in tqdm(self.datas):
            sample_input_ids = copy.copy(meta_tokens)
            sample_labels = [-100] * len(sample_input_ids)

            for idx, item in enumerate(data):
                if idx > 0:
                    input, output = item[0], item[1]

                    input_tokens = self.tokenizer(input, padding=False, truncation=False, add_special_tokens=False)
                    input_tokens = input_tokens["input_ids"][:self.max_length//3]

                    len_input = len(input_tokens)
                    output_tokens = self.tokenizer(output, padding=False, truncation=False, add_special_tokens=False)
                    output_tokens = output_tokens["input_ids"][:2 * (self.max_length//3) - 1]

                    sample_input_ids += input_tokens + output_tokens
                    # Mask the prompt part so that only response tokens contribute to the loss.
                    sample_labels += [-100] * len_input + output_tokens

            # Skip samples that contributed no tokens; otherwise append the sample
            # followed by an EOS separator.
            if len(sample_input_ids) != len(meta_tokens):
                self.input_ids += sample_input_ids
                self.labels += sample_labels

                self.input_ids += [self.tokenizer.eos_token_id]
                self.labels += [self.tokenizer.eos_token_id]

        self.attention_mask = [1] * len(self.input_ids)

    def __len__(self):
        return len(self.input_ids) // self.max_length

    def __getitem__(self, index):
        return torch.tensor(self.input_ids[index * self.max_length : (index + 1) * self.max_length]), \
            torch.tensor(self.labels[index * self.max_length : (index + 1) * self.max_length]), \
            torch.tensor(self.attention_mask[index * self.max_length : (index + 1) * self.max_length])


class BertDataset_onlyres(Dataset):
    '''
    Padding is applied between samples, and the length of each sample does not exceed
    max_length. Only the loss on the response is computed.
    '''
    def __init__(self, tokenizer, datas, max_length):
        super().__init__()
        self.datas = datas
        self.tokenizer = tokenizer
        self.max_length = max_length

        if not self.tokenizer.bos_token:
            self.tokenizer.bos_token = "<s>"
        if not self.tokenizer.eos_token:
            self.tokenizer.eos_token = "</s>"
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        print('BertDataset_onlyres finished..')

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, index):
        meta_prompt = self.datas[0][0]
        meta_tokens = self.tokenizer(meta_prompt, padding=False, truncation=False, add_special_tokens=False)
        meta_tokens = meta_tokens["input_ids"][-self.max_length//3:]

        data = self.datas[index]
        sample_input_ids = copy.copy(meta_tokens)
        sample_labels = [-100] * len(sample_input_ids)

        for idx, item in enumerate(data):
            if idx > 0:
                input, output = item[0], item[1]
                input_tokens = self.tokenizer(input, padding=False, truncation=False, add_special_tokens=False)
                input_tokens = input_tokens["input_ids"][:self.max_length//3]

                len_input = len(input_tokens)
                output_tokens = self.tokenizer(output, padding=False, truncation=False, add_special_tokens=False)
                output_tokens = output_tokens["input_ids"][:2 * (self.max_length//3) - 1]

                sample_input_ids += input_tokens + output_tokens
                sample_labels += [-100] * len_input + output_tokens

        # Append EOS, then pad and truncate everything to max_length.
        sample_input_ids += [self.tokenizer.eos_token_id]
        sample_labels += [self.tokenizer.eos_token_id]
        sample_attention_mask = [1] * len(sample_input_ids)

        sample_input_ids += [self.tokenizer.pad_token_id] * (self.max_length - len(sample_input_ids))
        sample_labels += [-100] * (self.max_length - len(sample_labels))
        sample_attention_mask += [0] * (self.max_length - len(sample_attention_mask))

        sample_input_ids = sample_input_ids[:self.max_length]
        sample_labels = sample_labels[:self.max_length]
        sample_attention_mask = sample_attention_mask[:self.max_length]

        return torch.tensor(sample_input_ids), torch.tensor(sample_labels), torch.tensor(sample_attention_mask)


class DatasetIds(Dataset):
    '''
    Same packed format as GPT2Dataset_onlyres: no padding, samples separated by the
    end-of-sequence (EOS) token. This dataset loads already-preprocessed token ids
    directly, eliminating the tokenization wait.
    '''
    def __init__(self, tokenizer, datas, max_length, **kwargs):
        super().__init__()
        self.input_ids = datas['input_ids']
        self.attention_mask = datas['attention_mask']
        self.labels = datas['labels']
        self.max_length = max_length

    def __len__(self):
        return len(self.input_ids) // self.max_length

    def __getitem__(self, index):
        return torch.tensor(self.input_ids[index * self.max_length : (index + 1) * self.max_length]), \
            torch.tensor(self.labels[index * self.max_length : (index + 1) * self.max_length]), \
            torch.tensor(self.attention_mask[index * self.max_length : (index + 1) * self.max_length])
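
As a usage sketch (assuming the pickle produced by `convert_data.py`; the batch size and paths are illustrative), `DatasetIds` can be driven with a standard `DataLoader`:

```python
# Sketch: feed the preprocessed ids to DatasetIds and iterate fixed-length chunks.
import pickle
from torch.utils.data import DataLoader
from transformers import LlamaTokenizer

from dataset import DatasetIds

tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-lora-7b")
with open("data/test/llama_ift_data_ids.pkl", "rb") as f:
    datas = pickle.load(f)

train_dataset = DatasetIds(tokenizer, datas, max_length=2048)
loader = DataLoader(train_dataset, batch_size=3, shuffle=True)

input_ids, labels, attention_mask = next(iter(loader))
print(input_ids.shape)  # torch.Size([3, 2048]) -- each row is one packed 2048-token chunk
```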