from contextlib import nullcontext
import json
import time
import random
import argparse
import sys
import re
import os
from types import SimpleNamespace
from typing import Optional

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score

# change it with respect to the original model
from classifier import LlamaZeroShotClassifier, LlamaEmbeddingClassifier
from llama import Llama, load_pretrained
from optimizer import AdamW
from tokenizer import Tokenizer
from tqdm import tqdm

TQDM_DISABLE = False


# fix the random seed
def seed_everything(seed=11711):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# custom Dataset class to be used with the DataLoader
class LlamaDataset(Dataset):
    def __init__(self, dataset, args, eos=False):
        self.dataset = dataset
        self.p = args
        self.tokenizer = Tokenizer(max_len=args.max_sentence_len)
        self.eos = eos

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        ele = self.dataset[idx]
        return ele

    def pad_data(self, data):
        sents = [x[0] for x in data]
        labels = [x[1] for x in data]
        encoding = [self.tokenizer.encode(s, bos=True, eos=self.eos) for s in sents]
        max_length_in_batch = max([len(sentence) for sentence in encoding])
        encoding_padded = [sentence + [self.tokenizer.pad_id] * (max_length_in_batch - len(sentence))
                           for sentence in encoding]
        token_ids = torch.LongTensor(encoding_padded)
        labels = torch.LongTensor(labels)
        return token_ids, labels, sents

    def collate_fn(self, all_data):
        token_ids, labels, sents = self.pad_data(all_data)
        batched_data = {
            'token_ids': token_ids,
            'labels': labels,
            'sents': sents,
        }
        return batched_data


# create the data, which is a list of (sentence, label, tokens for the sentence)
def create_data(filename, tokenizer: Tokenizer, flag: str = 'train', lower: bool = False,
                eos: bool = True, prompt_suffix: Optional[str] = None):
    num_labels = {}
    data = []
    with open(filename, 'r') as fp:
        for line in fp:
            label, org_sent = line.split(' ||| ')
            if lower:
                org_sent = org_sent.lower()
            sent = org_sent.strip()
            if prompt_suffix is not None:
                sent = f"{sent} {prompt_suffix}"
            tokens = tokenizer.encode(sent, bos=True, eos=eos)
            label = int(label.strip())
            if label not in num_labels:
                num_labels[label] = len(num_labels)
            data.append((sent, label, tokens))
    print(f"load {len(data)} data from {filename}")
    if flag == 'train':
        return data, len(num_labels)
    else:
        return data
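

# Illustrative example of the input file format that create_data expects (the label ids
# and sentences below are made up; the real lines come from the dataset files):
#   1 ||| a gripping , well-acted thriller .
#   0 ||| the plot never gets off the ground .
# Each line is "<integer label> ||| <sentence>", matching the split on ' ||| ' above.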


# perform model evaluation in terms of the accuracy and f1 score.
def model_eval(dataloader, model, device):
    model.eval()  # switch to eval mode, which turns off randomness such as dropout
    y_true = []
    y_pred = []
    sents = []
    for step, batch in enumerate(tqdm(dataloader, desc=f'eval', disable=TQDM_DISABLE)):
        b_ids, b_labels, b_sents = batch['token_ids'], batch['labels'], batch['sents']
        b_ids = b_ids.to(device)
        logits = model(b_ids)
        logits = logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1).flatten()
        b_labels = b_labels.flatten()
        y_true.extend(b_labels)
        y_pred.extend(preds)
        sents.extend(b_sents)
    f1 = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    return acc, f1, y_pred, y_true, sents


def save_model(model, optimizer, args, config, filepath):
    save_info = {
        'model': model.state_dict(),
        'optim': optimizer.state_dict(),
        'args': args,
        'model_config': config,
        'system_rng': random.getstate(),
        'numpy_rng': np.random.get_state(),
        'torch_rng': torch.random.get_rng_state(),
    }
    torch.save(save_info, filepath)
    print(f"save the model to {filepath}")


def train(args):
    device = torch.device('cuda') if args.use_gpu else torch.device('cpu')

    #### Load data
    # create the data and its corresponding datasets and dataloaders
    tokenizer = Tokenizer(args.max_sentence_len)
    train_data, num_labels = create_data(args.train, tokenizer, 'train')
    dev_data = create_data(args.dev, tokenizer, 'valid')

    train_dataset = LlamaDataset(train_data, args)
    dev_dataset = LlamaDataset(dev_data, args)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
                                  collate_fn=train_dataset.collate_fn)
    dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
                                collate_fn=dev_dataset.collate_fn)

    #### Init model
    config = {'hidden_dropout_prob': args.hidden_dropout_prob,
              'pretrained_model_path': args.pretrained_model_path,
              'num_labels': num_labels,
              'data_dir': '.',
              'option': args.option}
    config = SimpleNamespace(**config)

    # initialize the sentence classification model
    model = LlamaEmbeddingClassifier(config)
    model = model.to(device)

    lr = args.lr
    ## specify the optimizer
    optimizer = AdamW(model.parameters(), lr=lr)
    best_dev_acc = 0

    ## run for the specified number of epochs
    for epoch in tqdm(range(args.epochs)):
        model.train()
        train_loss = 0
        num_batches = 0
        for step, batch in enumerate(tqdm(train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE)):
            b_ids, b_labels, b_sents = batch['token_ids'], batch['labels'], batch['sents']
            b_ids = b_ids.to(device)
            b_labels = b_labels.to(device)

            optimizer.zero_grad()
            logits = model(b_ids)
            loss = F.nll_loss(logits, b_labels.view(-1), reduction='sum') / args.batch_size
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            num_batches += 1

        train_loss = train_loss / num_batches

        train_acc, train_f1, *_ = model_eval(train_dataloader, model, device)
        dev_acc, dev_f1, *_ = model_eval(dev_dataloader, model, device)

        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            save_model(model, optimizer, args, config, args.filepath)

        print(f"epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")
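

# Note on the loss used above: F.nll_loss expects log-probabilities, so this training
# loop assumes LlamaEmbeddingClassifier returns log-probs (e.g. via log_softmax); with
# raw logits, F.cross_entropy would be the equivalent, since
#   F.nll_loss(F.log_softmax(logits, dim=-1), target) == F.cross_entropy(logits, target)
# Dividing the summed loss by args.batch_size approximates a per-example mean (only
# approximately, because the final batch may be smaller than args.batch_size).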


def generate_sentence(args, prefix, outfile, max_new_tokens=75, temperature=0.0):
    with torch.no_grad():
        device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
        ctx = torch.amp.autocast(device_type="cuda", dtype=torch.float32) if args.use_gpu else nullcontext()
        llama = load_pretrained(args.pretrained_model_path)
        llama = llama.to(device)
        print(f"load model from {args.pretrained_model_path}")
        enc = Tokenizer(args.max_sentence_len)
        start_ids = enc.encode(prefix, bos=True, eos=False)
        x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]

        # run generation
        with ctx:
            y = llama.generate(x, max_new_tokens, temperature=temperature)
            sentence = enc.decode(y[0].tolist())
            print(f"Temperature is {temperature}")
            print(sentence)
            print('---------------')

        with open(outfile, 'w') as writer:
            writer.write(sentence)
        print(f"Wrote generated sentence to {outfile}.")


def write_predictions_to_file(split: str, outfile: str, acc: float, pred: list[str], sents: list[str]):
    with open(outfile, "w+") as f:
        print(f"{split} acc :: {acc :.3f}")
        for s, p in zip(sents, pred):
            f.write(f"{p} ||| {s}\n")


def test_with_prompting(args):
    assert args.dev_out.endswith("dev-prompting-output.txt"), \
        'For saving prompting results, please set the dev_out argument to a path ending with "dev-prompting-output.txt"'
    assert args.test_out.endswith("test-prompting-output.txt"), \
        'For saving prompting results, please set the test_out argument to a path ending with "test-prompting-output.txt"'
    with torch.no_grad():
        device = torch.device('cuda') if args.use_gpu else torch.device('cpu')

        #### Load data
        # create the data and its corresponding datasets and dataloaders
        tokenizer = Tokenizer(args.max_sentence_len)
        label_names = json.load(open(args.label_names, 'r'))
        _, num_labels = create_data(args.train, tokenizer, 'train')

        #### Init model
        config = {'pretrained_model_path': args.pretrained_model_path,
                  'label_names': label_names,
                  'num_labels': num_labels,
                  'data_dir': '.',
                  'option': args.option}
        config = SimpleNamespace(**config)

        if len(label_names) == 2:
            label_name_str = " or ".join(label_names)
        else:
            label_name_str = ", ".join(label_names[:-1]) + ", or " + label_names[-1]
        prompt_suffix = f"Is this movie {label_name_str}? This movie is "

        model = LlamaZeroShotClassifier(config, tokenizer, label_names)
        model = model.to(device)

        dev_data = create_data(args.dev, tokenizer, 'valid', eos=False, prompt_suffix=prompt_suffix)
        dev_dataset = LlamaDataset(dev_data, args, eos=False)
        dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
                                    collate_fn=dev_dataset.collate_fn)

        test_data = create_data(args.test, tokenizer, 'test', eos=False, prompt_suffix=prompt_suffix)
        test_dataset = LlamaDataset(test_data, args, eos=False)
        test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=args.batch_size,
                                     collate_fn=test_dataset.collate_fn)

        dev_acc, dev_f1, dev_pred, dev_true, dev_sents = model_eval(dev_dataloader, model, device)
        test_acc, test_f1, test_pred, test_true, test_sents = model_eval(test_dataloader, model, device)

        write_predictions_to_file("dev", args.dev_out, dev_acc, dev_pred, dev_sents)
        write_predictions_to_file("test", args.test_out, test_acc, test_pred, test_sents)
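

# Example of the zero-shot prompt constructed above, assuming args.label_names maps the
# classes to two names such as ["negative", "positive"] (the actual names come from the
# label-mapping JSON file):
#   "<review text> Is this movie negative or positive? This movie is "
# LlamaZeroShotClassifier is then expected to score the label names as continuations of
# this prompt; see classifier.py for the actual scoring logic.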


def test(args):
    assert args.dev_out.endswith("dev-finetuning-output.txt"), \
        'For saving finetuning results, please set the dev_out argument to a path ending with "dev-finetuning-output.txt"'
    assert args.test_out.endswith("test-finetuning-output.txt"), \
        'For saving finetuning results, please set the test_out argument to a path ending with "test-finetuning-output.txt"'
    with torch.no_grad():
        device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
        saved = torch.load(args.filepath)
        config = saved['model_config']
        model = LlamaEmbeddingClassifier(config)
        model.load_state_dict(saved['model'])
        model = model.to(device)
        print(f"load model from {args.filepath}")

        tokenizer = Tokenizer(args.max_sentence_len)
        dev_data = create_data(args.dev, tokenizer, 'valid')
        dev_dataset = LlamaDataset(dev_data, args)
        dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
                                    collate_fn=dev_dataset.collate_fn)

        test_data = create_data(args.test, tokenizer, 'test')
        test_dataset = LlamaDataset(test_data, args)
        test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=args.batch_size,
                                     collate_fn=test_dataset.collate_fn)

        dev_acc, dev_f1, dev_pred, dev_true, dev_sents = model_eval(dev_dataloader, model, device)
        test_acc, test_f1, test_pred, test_true, test_sents = model_eval(test_dataloader, model, device)

        write_predictions_to_file("dev", args.dev_out, dev_acc, dev_pred, dev_sents)
        write_predictions_to_file("test", args.test_out, test_acc, test_pred, test_sents)


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", type=str, default="data/cfimdb-train.txt")
    parser.add_argument("--dev", type=str, default="data/cfimdb-dev.txt")
    parser.add_argument("--test", type=str, default="data/cfimdb-test.txt")
    parser.add_argument("--label-names", type=str, default="data/cfimdb-label-mapping.json")
    parser.add_argument("--pretrained-model-path", type=str, default="stories42M.pt")
    parser.add_argument("--max_sentence_len", type=int, default=None)
    parser.add_argument("--seed", type=int, default=1337)
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--option", type=str,
                        help='generate: sample continuations from the pretrained model; prompt: the Llama parameters are frozen; finetune: the Llama parameters are updated',
                        choices=('generate', 'prompt', 'finetune'), default="generate")
    parser.add_argument("--use_gpu", action='store_true')
    parser.add_argument("--generated_sentence_low_temp_out", type=str, default="generated-sentence-temp-0.txt")
    parser.add_argument("--generated_sentence_high_temp_out", type=str, default="generated-sentence-temp-1.txt")
    parser.add_argument("--dev_out", type=str, default="cfimdb-dev-prompting-output.txt")
    parser.add_argument("--test_out", type=str, default="cfimdb-test-prompting-output.txt")

    # hyperparameters
    parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
    parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
    parser.add_argument("--lr", type=float, default=2e-5,
                        help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5")

    args = parser.parse_args()
    print(f"args: {vars(args)}")
    return args


if __name__ == "__main__":
    args = get_args()
    args.filepath = f'{args.option}-{args.epochs}-{args.lr}.pt'  # save path
    seed_everything(args.seed)  # fix the seed for reproducibility

    if args.option == "generate":
        # Step 1
        # Complete this sentence to test your implementation!
        prefix = "I have wanted to see this thriller for a while, and it didn't disappoint. Keanu Reeves, playing the hero John Wick, is"
        generate_sentence(args, prefix, args.generated_sentence_low_temp_out, max_new_tokens=75, temperature=0.0)
        generate_sentence(args, prefix, args.generated_sentence_high_temp_out, max_new_tokens=75, temperature=1.0)
    elif args.option == "prompt":
        # Step 2
        # Solve this task with prompted language modeling
        test_with_prompting(args)
    elif args.option == "finetune":
        # Step 3
        # Finetune a classification model
        train(args)
        # Step 4
        # Evaluate the model on the dev and test sets
        test(args)
    else:
        raise ValueError(f"Invalid option: {args.option}")
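

# Example invocations (illustrative; assumes this file is saved as run_llama.py and that
# the data files and stories42M.pt checkpoint referenced by the defaults are present):
#   python run_llama.py --option generate --use_gpu
#   python run_llama.py --option prompt --use_gpu
#   python run_llama.py --option finetune --epochs 5 --lr 2e-5 --batch_size 8 --use_gpu \
#       --dev_out cfimdb-dev-finetuning-output.txt --test_out cfimdb-test-finetuning-output.txt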