Commit
upload code and data
Abbey4799 committed Jun 26, 2023
1 parent d587b50 commit 6fb7468
Showing 12 changed files with 690 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -1,2 +1,3 @@
# CuteGPT

An open-source conversational language model developed by the Knowledge Works Research Laboratory at Fudan University.
77 changes: 77 additions & 0 deletions README_ch.md
@@ -0,0 +1,77 @@
# CuteGPT

CuteGPT is an open-source bilingual (Chinese-English) conversational language model released by the [Knowledge Works Research Laboratory at Fudan University](http://kw.fudan.edu.cn/). It is built on the original Llama with improvements and extensions, has 13B (13 billion) parameters, and can run int8 inference on a single RTX 3090 GPU. CuteGPT extends the original Llama vocabulary with Chinese tokens and performs further pre-training to improve Chinese comprehension, followed by instruction fine-tuning on dialogue data to strengthen its ability to follow instructions.

## Local Deployment

### Install Dependencies

```bash
conda create -n cutegpt python=3.7
conda activate cutegpt
pip install -r requirements.txt
```

### Usage Example

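A minimal int8 inference sketch (an illustrative assumption, not an official snippet): the checkpoint path is a placeholder for wherever the CuteGPT weights live, and the prompt follows the meta prompt and `问:{}\n答:` template defined in `code/config.py`. int8 loading requires `bitsandbytes` and `accelerate`.

```python
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Placeholder path: substitute the released CuteGPT-13B weights
CHECKPOINT = "path/to/cutegpt-13b"

tokenizer = LlamaTokenizer.from_pretrained(CHECKPOINT)
# load_in_8bit lets the 13B model fit on a single RTX 3090 (needs bitsandbytes)
model = LlamaForCausalLM.from_pretrained(
    CHECKPOINT, load_in_8bit=True, device_map="auto"
)
model.eval()

# Prompt format from code/config.py: meta prompt + "问:{}\n答:" template
meta_prompt = '你是复旦大学知识工场实验室训练出来的语言模型CuteGPT。给定任务描述,请给出对应请求的回答。\n'
prompt = meta_prompt + "问:用一句话介绍复旦大学。\n答:"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.9)
# Strip the prompt tokens and decode only the newly generated answer
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```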

## Fine-tuning

### Data Format

The data is stored as `.pkl` files; you can inspect it with the following code:

```python
import pickle
import random
file = 'data/test/ift_data.pkl'
with open(file, "rb") as f:
datas = pickle.load(f)
print(len(datas))
print(random.sample(datas,3))
```

The data is a two-dimensional list: `len(datas)` is the number of samples, and `len(datas[i])` is the number of dialogue turns in the `i`-th sample. Each turn is a dict with the keys `instruction`, `input`, and `output`. For example:

```python
[[{'instruction': '请帮我生成一首诗。', 'input': '', 'output': '好的,请给我一些关键词来启发诗歌的创作。'},
{'instruction': '月亮、海浪、夜晚。',
'input': '',
'output': '心中的孤独由天空掩盖,\n夜幕笼罩下,海浪轻扬,\n月光照亮着沙滩,\n让我不再孤单。'}],
[{'instruction': '根据给定的数据集,训练一个能够对垃圾邮件进行分类的模型。\n提供垃圾邮件和非垃圾邮件两类的数据集。',
'input': '',
'output': '要训练一个能...'}]]
```
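To build your own `ift_data.pkl` in this format, a minimal sketch (the dialogue content and path are illustrative only):

```python
import pickle

# Each sample is a list of turns; each turn is a dict with the keys
# "instruction", "input", and "output"
samples = [
    [
        {'instruction': '请帮我生成一首诗。', 'input': '', 'output': '好的,请给我一些关键词来启发诗歌的创作。'},
        {'instruction': '月亮、海浪、夜晚。', 'input': '', 'output': '心中的孤独由天空掩盖,……'},
    ],
]

with open('data/test/ift_data.pkl', 'wb') as f:
    pickle.dump(samples, f)
```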

### Fine-tuning Code

Preprocess the data, concatenating it into the multi-turn dialogue format:

```bash
python code/convert_data.py \
    --tokenizer ziqingyang/chinese-llama-lora-7b \
    --max_length 2048 \
    --out_data_path data/test/
```

Train the model:

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed lora_llama_flashattn.py \
--master_port 12932 \
--save_steps 2000 \
--max_epoches 5 \
--save_name llama_lora_623v1 \
--model_path /data/xuyipei/my_llama/my_llama_13b/llama_13b_112/ \
--dataset_type DatasetIds_HQY \
--data_path ../weighted_dataset/623v1/llama_ift_data_ids.pkl \
--max_length 2048 \
--use_flash_attention
```

Parameter descriptions

- `dataset_type`: name of the `Dataset` class in `code/dataset.py` used to load the training data (e.g. `DatasetIds`, which reads the pre-tokenized ids written by `convert_data.py`)
Binary file added code/__pycache__/dataset.cpython-37.pyc
Binary file added code/__pycache__/utils.cpython-37.pyc
98 changes: 98 additions & 0 deletions code/config.py
@@ -0,0 +1,98 @@

from peft import LoraConfig


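# DeepSpeed config: bf16 training, AdamW, warmup LR schedule, ZeRO stage 1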
DS_CONFIG = {
"bf16": {
"enabled": True,
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": 3e-5,
"betas": [0.98, 0.999],
"eps": 1e-9
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 1e-4,
"warmup_max_lr": 3e-4,
"warmup_num_steps": 300
}
},
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 2e8,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 2e8,
"contiguous_gradients": True,
# "offload_optimizer": {
# "device": "cpu"
# },
"stage3_gather_16bit_weights_on_model_save": True
},
"gradient_accumulation_steps": 8,
"train_micro_batch_size_per_gpu": 3,
"wall_clock_breakdown": False
}

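# LoRA hyperparameters: rank-8 adapters with dropout on the attention q/v projections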
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = [
"q_proj",
"v_proj",
]

lora_config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
target_modules=TARGET_MODULES,
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
)



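# Templates for wrapping user inputs; "{}" appears three times so that,
# under uniform sampling, the bare prefix-free form is picked more often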
input_template_pool = [
"Input:{}",
"Question:{}",
"Problem:{}",
"Q:{}",
"*Question*:{}",
"(Problem){}",
"{}",
"{}",
"{}"
]

template_pool = {
'wround_woinput':[
"问:{}\n答:{}\n",
"Instruction:{}\Response:{}\n",
"{}\n{}\n"
],
'wround_winput':[
"背景:{}\n{}\n答:{}\n",
"已知:{}\n{}\n回答:{}\n",
"问:{}\n{}\n答:{}\n",
"Instruction:{}\n{}\nResponse:{}\n",
"{}\n{}\n{}\n"
],
'woround_woinput':[
"问:{}\n答:{}\n",
"Instruction:{}\nResponse:{}\n"
"{}\n{}\n"
],
'woround_winput':[
"问:{}\n{}\n答:{}\n",
"Instruction:{}\n{}\nResponse:{}\n"
"{}\n{}\n{}\n"
]
}
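# Meta prompt (Chinese): "You are CuteGPT, a language model trained by the
# Knowledge Works Research Laboratory at Fudan University. Given a task
# description, please provide an answer to the corresponding request."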
meta_prompt = '你是复旦大学知识工场实验室训练出来的语言模型CuteGPT。给定任务描述,请给出对应请求的回答。\n'
30 changes: 30 additions & 0 deletions code/convert_data.py
@@ -0,0 +1,30 @@
import pickle
import argparse
from transformers import LlamaTokenizer
from dataset import GPT2Dataset_onlyres
from utils import get_multiround_data

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer", type=str, default="ziqingyang/chinese-llama-lora-7b")
parser.add_argument("--max_length",type=int,default=1024,help="max token length")
parser.add_argument("--out_data_path",type=str,default='data/test/',help="the floader to load raw data and save preprocessed data")
args = parser.parse_args()

tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer)
datas = get_multiround_data(args.out_data_path + 'ift_data.pkl')
train_dataset = GPT2Dataset_onlyres(tokenizer, datas, args.max_length)

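    # Save the pre-tokenized ids so training can load them directly via DatasetIds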
pickle.dump(
{
"input_ids": train_dataset.input_ids,
"labels": train_dataset.labels,
"attention_mask": train_dataset.attention_mask
},
open(args.out_data_path + "llama_ift_data_ids.pkl", "wb")
)

150 changes: 150 additions & 0 deletions code/dataset.py
@@ -0,0 +1,150 @@
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import copy


class GPT2Dataset_onlyres(Dataset):
'''
Dataset construction for training GPT-2 model, without padding. Truncation is done using the end-of-sequence (EOS) token, and only the loss for the response is computed.
'''
def __init__(self, tokenizer, datas, max_length):
super().__init__()
self.datas = datas
self.tokenizer = tokenizer
self.max_length = max_length
self.index = 0

if not self.tokenizer.bos_token:
self.tokenizer.bos_token = "<s>"
if not self.tokenizer.eos_token:
self.tokenizer.eos_token = "</s>"
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token

self._preprocess()

def _preprocess(self):
self.input_ids = []

self.labels = []
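        # datas[0][0] holds the shared meta prompt; keep at most its last max_length//3 tokens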
meta_prompt = self.datas[0][0]
meta_tokens = self.tokenizer(meta_prompt, padding=False, truncation=False, add_special_tokens=False)
meta_tokens = meta_tokens["input_ids"][-self.max_length//3:]

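        # Pack all dialogues into one long token stream; input tokens are
        # masked with -100 so the loss is computed only on response tokens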
for data in tqdm(self.datas):
sample_input_ids = copy.copy(meta_tokens)
sample_labels = [-100] * len(sample_input_ids)

for idx, item in enumerate(data):
if idx > 0:
input, output = item[0], item[1]

input_tokens = self.tokenizer(input, padding=False, truncation=False, add_special_tokens=False)
input_tokens = input_tokens["input_ids"][:self.max_length//3]

len_input = len(input_tokens)
output_tokens = self.tokenizer(output, padding=False, truncation=False, add_special_tokens=False)
output_tokens = output_tokens["input_ids"][:2 * (self.max_length//3) - 1]

sample_input_ids += input_tokens + output_tokens
sample_labels += [-100] * len_input + output_tokens

if len(sample_input_ids) != len(meta_tokens):
self.input_ids += sample_input_ids
self.labels += sample_labels

self.input_ids += [self.tokenizer.eos_token_id]
self.labels += [self.tokenizer.eos_token_id]

self.attention_mask = [1] * len(self.input_ids)

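    # The packed stream is served in fixed-size windows of max_length tokens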
def __len__(self):
return len(self.input_ids) // self.max_length

def __getitem__(self, index):
return torch.tensor(self.input_ids[index * self.max_length : (index + 1) * self.max_length]), \
torch.tensor(self.labels[index * self.max_length : (index + 1) * self.max_length]), \
torch.tensor(self.attention_mask[index * self.max_length : (index + 1) * self.max_length])


class BertDataset_onlyres(Dataset):
'''
Padding is applied between each sample, and the length of each sample does not exceed max_length. Only the loss for the response is computed.
'''
def __init__(self, tokenizer, datas, max_length):
super().__init__()
self.datas = datas
self.tokenizer = tokenizer
self.max_length = max_length

if not self.tokenizer.bos_token:
self.tokenizer.bos_token = "<s>"
if not self.tokenizer.eos_token:
self.tokenizer.eos_token = "</s>"
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token
print('BertDataset_onlyres finished..')

def __len__(self):
return len(self.datas)

def __getitem__(self, index):

meta_prompt = self.datas[0][0]
meta_tokens = self.tokenizer(meta_prompt, padding=False, truncation=False, add_special_tokens=False)
meta_tokens = meta_tokens["input_ids"][-self.max_length//3:]

data = self.datas[index]
sample_input_ids = copy.copy(meta_tokens)
sample_labels = [-100] * len(sample_input_ids)

for idx, item in enumerate(data):
if idx > 0:
input, output = item[0], item[1]
input_tokens = self.tokenizer(input, padding=False, truncation=False, add_special_tokens=False)
input_tokens = input_tokens["input_ids"][:self.max_length//3]

len_input = len(input_tokens)
output_tokens = self.tokenizer(output, padding=False, truncation=False, add_special_tokens=False)
output_tokens = output_tokens["input_ids"][:2 * (self.max_length//3) - 1]

sample_input_ids += input_tokens + output_tokens
sample_labels += [-100] * len_input + output_tokens

sample_input_ids += [self.tokenizer.eos_token_id]
sample_labels += [self.tokenizer.eos_token_id]
sample_attention_mask = [1] * len(sample_input_ids)

sample_input_ids += [self.tokenizer.pad_token_id] * (self.max_length - len(sample_input_ids))
sample_labels += [-100] * (self.max_length - len(sample_labels))
sample_attention_mask += [0] * (self.max_length - len(sample_attention_mask))

sample_input_ids = sample_input_ids[:self.max_length]
sample_labels = sample_labels[:self.max_length]
sample_attention_mask = sample_attention_mask[:self.max_length]


return torch.tensor(sample_input_ids), torch.tensor(sample_labels), torch.tensor(sample_attention_mask)



class DatasetIds(Dataset):
'''
Dataset construction for training GPT-2 model, without padding. Truncation is done using the end-of-sequence (EOS) token.
This dataset directly loads preprocessed data, eliminating the need for waiting.
'''
def __init__(self, tokenizer, datas, max_length, **kwargs):
super().__init__()
self.input_ids = datas['input_ids']
self.attention_mask = datas['attention_mask']
self.labels = datas['labels']
self.max_length = max_length

def __len__(self):
return len(self.input_ids) // self.max_length

def __getitem__(self, index):
return torch.tensor(self.input_ids[index * self.max_length : (index + 1) * self.max_length]), \
torch.tensor(self.labels[index * self.max_length : (index + 1) * self.max_length]), \
torch.tensor(self.attention_mask[index * self.max_length : (index + 1) * self.max_length])
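
As a rough illustration (not part of this commit), the pickle written by `code/convert_data.py` plugs into `DatasetIds` and a standard PyTorch `DataLoader`; the batch size mirrors `train_micro_batch_size_per_gpu` in `code/config.py`:

```python
import pickle
from torch.utils.data import DataLoader
from dataset import DatasetIds  # assumes running from the code/ directory

with open("data/test/llama_ift_data_ids.pkl", "rb") as f:
    datas = pickle.load(f)

# tokenizer is unused by DatasetIds, so None is fine here
train_dataset = DatasetIds(tokenizer=None, datas=datas, max_length=2048)
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)

for input_ids, labels, attention_mask in train_loader:
    # each tensor has shape (batch_size, max_length); the training loop
    # itself lives in lora_llama_flashattn.py
    break
```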