Commit
1 parent 77e66b8 · commit 416e3a8
Showing 3 changed files with 435 additions and 0 deletions.
@@ -0,0 +1,66 @@
import json
import pandas as pd


def json_file_writer(file_path, data):
    with open(file_path, 'w+') as f:
        json.dump(data, f)


def llama3_data_preparation(system_prompt, query_list, response_list, out_file_path):
    data = []
    for query, response in zip(query_list, response_list):
        data_point = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{system_prompt} <|eot_id|><|start_header_id|>user<|end_header_id|>
{query} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
{response} <|eot_id|>'''
        data.append({"text": data_point})
    ready_to_train = {"version": "0.1.0",
                      "data": data}

    json_file_writer(out_file_path, ready_to_train)


## Only for consolidated_dev.csv
def loading_inference_file(inference_file_path):
    df = pd.read_csv(inference_file_path)
    # Convert the data frame into a list of record dicts
    inference_dict = df.to_dict(orient='records')
    return inference_dict


def formating_question(data_point):
    question = data_point['question'] + ' (1) ' + str(data_point['opa']) + ' (2) ' + str(data_point['opb']) + ' (3) ' + str(data_point['opc']) + ' (4) ' + str(data_point['opd'])
    return question


def correct_option_extractor(data_point):
    return data_point['cop']


################
# VARIABLES
################

# CSV file to load
inference_file_path = "consolidated_dev.csv"

# Save the output
out_file_path = "ready_to_train.json"

# System prompt prepended to every training example
system_prompt = "You are presented with the following multiple choice question. Think step by step and then select the best answer. just return the correct option with its number"

###################
# Data preparation
###################

query_list = []
response_list = []
inference_dict = loading_inference_file(inference_file_path)

for i in range(len(inference_dict)):
    query_list.append(formating_question(inference_dict[i]))
    response_list.append(correct_option_extractor(inference_dict[i]))


# Create output training json
llama3_data_preparation(system_prompt, query_list, response_list, out_file_path)
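A quick way to sanity-check the generated training file (a usage sketch, not part of this commit; it only assumes the script above has already written ready_to_train.json):

import json

# Load the file produced by llama3_data_preparation and inspect its structure.
with open("ready_to_train.json") as f:
    prepared = json.load(f)

print(prepared["version"])                # "0.1.0"
print(len(prepared["data"]))              # number of question/answer examples
print(prepared["data"][0]["text"][:200])  # start of the first Llama 3-formatted example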
@@ -0,0 +1,281 @@
import os
import torch
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd


def loading_model_and_tokenizer(model_dir):
    model = AutoModelForCausalLM.from_pretrained(
        f'{model_dir}/model',
        load_in_4bit=True,
        device_map="auto",
        trust_remote_code=True,
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    # Load LLaMA tokenizer
    tokenizer = AutoTokenizer.from_pretrained(f'{model_dir}/tokenizer', trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer
def training_model(out_path, start_epoch, end_epoch, lora_r, lora_alpha, learning_rate, batch_size, logging_steps, save_steps, model, tokenizer, dataset):

    ################################################################################
    # QLoRA parameters
    ################################################################################

    # LoRA attention dimension (rank)
    lora_r = lora_r

    # Alpha parameter for LoRA scaling
    lora_alpha = lora_alpha

    # Dropout probability for LoRA layers
    lora_dropout = 0.1

    ################################################################################
    # bitsandbytes parameters
    ################################################################################

    # Activate 4-bit precision base model loading
    use_4bit = True

    # Compute dtype for 4-bit base models
    bnb_4bit_compute_dtype = "float16"

    # Quantization type (fp4 or nf4)
    bnb_4bit_quant_type = "nf4"

    # Activate nested quantization for 4-bit base models (double quantization)
    use_nested_quant = False

    ################################################################################
    # TrainingArguments parameters
    ################################################################################

    # Output directory where the model predictions and checkpoints will be stored
    output_dir = out_path + "/checkpoint"

    # Create the checkpoint directory if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Number of training epochs
    num_train_epochs = start_epoch

    # Enable fp16/bf16 training (set bf16 to True with an A100)
    fp16 = False
    bf16 = False

    # Batch size per GPU for training
    per_device_train_batch_size = batch_size

    # Batch size per GPU for evaluation
    per_device_eval_batch_size = batch_size

    # Number of update steps to accumulate the gradients for
    gradient_accumulation_steps = 1

    # Enable gradient checkpointing
    gradient_checkpointing = True

    # Maximum gradient norm (gradient clipping)
    max_grad_norm = 0.3

    # Initial learning rate (AdamW optimizer)
    learning_rate = learning_rate

    # Weight decay to apply to all layers except bias/LayerNorm weights
    weight_decay = 0.001

    # Optimizer to use
    optim = "paged_adamw_32bit"

    # Learning rate schedule
    lr_scheduler_type = "cosine"

    # Number of training steps (overrides num_train_epochs)
    max_steps = -1

    # Ratio of steps for a linear warmup (from 0 to learning rate)
    warmup_ratio = 0.03

    # Group sequences into batches with same length
    # Saves memory and speeds up training considerably
    group_by_length = True

    # Save checkpoint every X update steps
    save_steps = save_steps

    # Log every X update steps
    logging_steps = logging_steps

    ################################################################################
    # SFT parameters
    ################################################################################

    # Maximum sequence length to use
    max_seq_length = None

    # Pack multiple short examples in the same input sequence to increase efficiency
    packing = False

    # Load the entire model on GPU 0
    device_map = {"": 0}

    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    # Check GPU compatibility with bfloat16
    if compute_dtype == torch.float16 and use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16: accelerate training with bf16=True")
            print("=" * 80)

    # Load LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Set training parameters
    report_to = None

    while start_epoch <= end_epoch:
        # Resume from the previous checkpoint for every epoch after the first
        if start_epoch == 1:
            resume_from_checkpoint = False
        else:
            resume_from_checkpoint = True

        training_arguments = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=start_epoch,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            save_steps=save_steps,
            logging_steps=logging_steps,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            fp16=fp16,
            bf16=bf16,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=group_by_length,
            lr_scheduler_type=lr_scheduler_type,
            report_to=report_to,
        )

        # Set supervised fine-tuning parameters
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="text",
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
            packing=packing,
        )

        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
        peft_model_id = out_path + f'/model_{start_epoch}epoch'
        trainer.model.save_pretrained(peft_model_id)
        tokenizer.save_pretrained(peft_model_id)
        loss_df = pd.DataFrame(trainer.state.log_history)
        loss_df.to_csv(out_path + f'/loss_{start_epoch}epoch.csv')
        start_epoch = start_epoch + 1

###########################
# VARIABLES
###########################

# Path to the Llama3 training file.
data_path = "medmcq-736_out_of_182k_ready_to_train.json"

# Name of the Llama3 model you want to fine-tune.
model_dir = "Meta-Llama-3-8B-Instruct"

# Path to the directory where you want to save fine-tuned models and checkpoints.
out_path = "output/Llama3_finetuning/Llama3-8b_instruct_r=20_a_40_b5_medmcq-736_out_of_182k"

# Epoch number from which you want to start training.
# If the start epoch is greater than 1, there should be a saved checkpoint for the previous epoch.
# For example, if the start epoch is 2, there should be saved checkpoints for epoch 1.
start_epoch = 1

# Final epoch at which you want to fine-tune your model.
end_epoch = 2

# LoRA rank
lora_r = 20

# LoRA alpha
lora_alpha = 40

# Learning rate
learning_rate = 2e-4

# Batch size
batch_size = 5

# Number of steps after which you want to save a checkpoint.
save_steps = 150

# Number of steps after which you want to report loss.
logging_steps = 1


###################
# Dataset
###################
dataset = load_dataset("json", data_files=data_path, field="data")
dataset = dataset['train']

if save_steps >= int(len(dataset) / batch_size):
    save_steps = int(len(dataset) / batch_size) - 5
    print(f"Save steps changed to {save_steps}")


###########################
# Train
###########################
model, tokenizer = loading_model_and_tokenizer(model_dir)
training_model(out_path, start_epoch, end_epoch, lora_r, lora_alpha, learning_rate, batch_size, logging_steps, save_steps, model, tokenizer, dataset)
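The script imports PeftModel but only saves LoRA adapters; a minimal sketch of how a saved adapter could be loaded back for inference (an assumption of typical peft/transformers usage, not something this commit does; it reuses out_path and model_dir from the script above and assumes training finished at end_epoch = 2):

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hypothetical adapter directory written by training_model() after the final epoch.
adapter_dir = out_path + "/model_2epoch"

# Reload the 4-bit base model and attach the fine-tuned LoRA adapter.
base_model = AutoModelForCausalLM.from_pretrained(
    f"{model_dir}/model",
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True,
)
inference_model = PeftModel.from_pretrained(base_model, adapter_dir)
inference_tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
inference_model.eval()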