
PEFT fine-tune 4-bit quantized Mistral-7B for QA using Q-LoRA

Adapted from Unsloth AI's official fine-tuning scripts.

!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install "torch==2.4.0" "xformers==0.0.27.post2" triton torchvision torchaudio
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
!pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"
!pip install langchain peft
import os

from unsloth import FastLanguageModel
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
max_seq_length = 2048
dtype = None  # Unsloth autodetects.
load_in_4bit = True
batch_size = 16  # Not used by the trainer below, which sets its own per-device batch size.

# Load the 4-bit quantized checkpoint and its tokenizer first. The repo name below is the
# project's own checkpoint; training resumes from (and later pushes back to) the same name.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "jiviteshjn/pittsburgh-rag-qa-mistral-finetuned",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach LoRA adapters so only a small set of low-rank weights is trained.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # Unsloth supports any value, but 0 is optimized
    bias = "none",    # Unsloth supports any value, but "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # Unsloth also supports rank-stabilized LoRA
    loftq_config = None, # and LoftQ
)
# Helper that projects a raw record onto the fields used downstream. Defined for
# reference; the formatting map below reads the fields directly instead.
def process_example(example):
    return {
        "question": example["question"],
        "answer": example["answer"],
        "text_content": example["text_content"],
    }

df = pd.read_json('/path/to/input', lines=True)  # The dataset is small enough to fit in memory.
dataset = Dataset.from_pandas(df)
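
The formatting step below relies on three fields per record; a quick sanity check (an addition, not part of the original notebook) makes that assumption explicit:

# Each JSONL line is expected to provide a question, its answer, and the source passage.
assert {"question", "answer", "text_content"}.issubset(df.columns), "missing expected columns"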
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Context:
{}

### Q:
{}

### A:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompt(example):
    instruction = "Answer the following question based on the provided context. Answers must be taken exactly from the context, and you must never use your own knowledge. Answer questions accurately and faithfully to the context."
    context  = example["text_content"]
    question = example["question"]
    answer   = example["answer"]

    text = prompt.format(instruction, context, question, answer) + EOS_TOKEN
    return { "text": text }


formatted_dataset = dataset.map(formatting_prompt, batched=False)
print(formatted_dataset)
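
As an optional sanity check (not in the original notebook), printing one formatted example confirms the template and EOS token were applied as intended:

# Inspect a single fully formatted training example.
print(formatted_dataset[0]["text"])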
train_test_split = formatted_dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

train_dataset = train_dataset.shuffle(seed=42)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

Training loop

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
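    # Note: the eval split created above is held out for separate evaluation and is not passed to this trainer.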
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
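        # Effective batch size per optimizer step: 1 x 4 = 4 sequences.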
        warmup_steps = 5,
        max_steps = 3000,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
trainer_stats = trainer.train()
model.save_pretrained("pittsburgh-rag-qa-mistral-finetuned")
model.push_to_hub("jiviteshjn/pittsburgh-rag-qa-mistral-finetuned", token = os.environ["HF_TOKEN"])
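
A minimal generation sketch, assuming the notebook is still running with the trained model and tokenizer in memory; the question and context strings are placeholders, and the empty final slot leaves the "### A:" field for the model to complete:

FastLanguageModel.for_inference(model)  # switch the trained model to Unsloth's fast inference mode

question = "<your question>"              # placeholder
context  = "<retrieved context passage>"  # placeholder
inputs = tokenizer(
    [prompt.format("Answer the following question based on the provided context.", context, question, "")],
    return_tensors = "pt",
).to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
print(tokenizer.batch_decode(outputs, skip_special_tokens = True)[0])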