# Adapted from Unsloth AI's official fine-tuning scripts.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install "torch==2.4.0" "xformers==0.0.27.post2" triton torchvision torchaudio
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
!pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"
!pip install langchain peft
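# The preinstalled torch build on Kaggle typically does not match the xformers wheel
# that Unsloth expects, so the commands above remove torch (and its dependents) with
# pip-autoremove, reinstall a pinned torch 2.4.0 + xformers 0.0.27.post2 pair, and
# pull transformers from source for up-to-date model support.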
import os
from unsloth import FastLanguageModel
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
max_seq_length = 2048
dtype = None # Unsloth autodetects.
load_in_4bit = True
batch_size = 16
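# load_in_4bit = True loads the base weights 4-bit quantized (QLoRA-style fine-tuning),
# and dtype = None lets Unsloth pick bf16 or fp16 for the GPU. Note that batch_size
# defined here is not passed to the trainer below, which sets its own
# per_device_train_batch_size and gradient_accumulation_steps.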
# Load the base model and tokenizer first, then wrap the model with LoRA adapters.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "jiviteshjn/pittsburgh-rag-qa-mistral-finetuned",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,  # Unsloth supports any value, but 0 is optimized
    bias = "none",     # Unsloth supports any setting, but "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,   # Unsloth also supports rank-stabilized LoRA
    loftq_config = None,  # and LoftQ
)
def process_example(example):
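    """Keep only the question, answer, and context fields of a record."""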
return {
"question": example["question"],
"answer": example["answer"],
"text_content": example["text_content"],
}
df = pd.read_json('/path/to/input', lines=True)  # JSONL input; the dataset is small enough to load fully into memory
dataset = Dataset.from_pandas(df)
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Context:
{}
### Q:
{}
### A:
{}"""
EOS_TOKEN = tokenizer.eos_token
def formatting_prompt(example):
    instruction = "Answer the following question based on the provided context. Answers must be from the context exactly and you must never use your own knowledge. Answer questions accurately and faithful to the context."
    context = example["text_content"]
    question = example["question"]
    answer = example["answer"]
    text = prompt.format(instruction, context, question, answer) + EOS_TOKEN
    return { "text": text }
formatted_dataset = dataset.map(formatting_prompt, batched=False)
print(formatted_dataset)
train_test_split = formatted_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
train_dataset = train_dataset.shuffle(seed=42)
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")
trainer = SFTTrainer(
model = model,
tokenizer = tokenizer,
train_dataset = train_dataset,
dataset_text_field = "text",
max_seq_length = max_seq_length,
dataset_num_proc = 2,
packing = False,
args = TrainingArguments(
per_device_train_batch_size = 1,
gradient_accumulation_steps = 4,
warmup_steps = 5,
max_steps = 3000,
learning_rate = 2e-4,
fp16 = not torch.cuda.is_bf16_supported(),
bf16 = torch.cuda.is_bf16_supported(),
logging_steps = 1,
optim = "adamw_8bit",
weight_decay = 0.01,
lr_scheduler_type = "linear",
seed = 3407,
output_dir = "outputs",
report_to = "none",
),
)
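# Effective batch size is per_device_train_batch_size * gradient_accumulation_steps
# = 1 * 4 = 4 sequences per optimizer step; adamw_8bit plus the 4-bit base weights keep
# memory usage low on a single GPU. Note that the held-out eval split is not passed to
# the trainer above.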
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
trainer_stats = trainer.train()
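# Optional (a sketch mirroring the pre-training check above): report peak reserved
# memory and run time once training finishes; trainer_stats is the TrainOutput
# returned by trainer.train().
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Training runtime = {trainer_stats.metrics['train_runtime']:.1f} seconds.")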
model.save_pretrained("pittsburgh-rag-qa-mistral-finetuned")
model.push_to_hub("jiviteshjn/pittsburgh-rag-qa-mistral-finetuned", token = os.environ["HF_TOKEN"])
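# Optional follow-ups (a sketch, not part of the original flow): push the tokenizer
# alongside the adapters, then run a quick generation to sanity-check the fine-tune.
# The instruction, context, and question strings below are placeholders; in practice
# reuse the exact instruction text from formatting_prompt above.
tokenizer.push_to_hub("jiviteshjn/pittsburgh-rag-qa-mistral-finetuned", token = os.environ["HF_TOKEN"])
FastLanguageModel.for_inference(model)  # switch Unsloth to its faster inference mode
inference_inputs = tokenizer(
    [prompt.format(
        "Answer the following question based on the provided context.",  # placeholder instruction
        "Placeholder context about Pittsburgh.",                          # placeholder context
        "What does this context describe?",                               # placeholder question
        "",  # left blank so the model generates the answer
    )],
    return_tensors = "pt",
).to("cuda")
generated = model.generate(**inference_inputs, max_new_tokens = 128)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])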