# RunModel.py · Ecuadorian-Manifesto-Classification



import numpy as np
import torch
from smaberta import TransformerModel
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Seed NumPy and PyTorch (including the CUDA seeding call) with the same fixed
# value as soon as this module is imported — presumably so that training runs
# are repeatable; note this is a side effect of importing the module.
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)


class RunModel:
    """Build, train and evaluate a smaberta ``TransformerModel`` classifier.

    Typical use is ``RunModel(network, train_data).run_model(valid_data)``,
    which builds the model, trains it on ``train_data`` and evaluates it on
    ``valid_data``. Evaluation reports are printed and also appended to
    ``results_file``.
    """

    def __init__(self, network, train_data, lr=5e-5, epochs=25,
                 output_dir='./saved_model/', num_labels=8,
                 results_file="results.txt"):
        """Store the run configuration; the model itself is built later.

        Args:
            network: ``"roberta"`` selects RoBERTa; any other value selects
                DistilBERT (see ``set_model``).
            train_data: Object indexed by ``'text'`` and ``'cmp_code'``
                (presumably a pandas DataFrame -- confirm against callers).
            lr: Learning rate forwarded to ``TransformerModel``.
            epochs: Number of training epochs forwarded to
                ``TransformerModel``.
            output_dir: Output directory forwarded to ``TransformerModel``.
            num_labels: Number of target classes forwarded to
                ``TransformerModel``.
            results_file: Path of the text file that ``print_info`` appends
                evaluation reports to.
        """
        self.network = network
        self.train_data = train_data
        self.lr = lr
        self.epochs = epochs
        self.output_dir = output_dir
        self.num_labels = num_labels
        self.results_file = results_file
        # Populated by set_model(); kept as None until then so that misuse can
        # be reported with a clear error instead of an AttributeError.
        self.model = None

    def _require_model(self):
        """Return the built model, or raise if ``set_model`` has not run."""
        model = getattr(self, "model", None)
        if model is None:
            raise RuntimeError(
                "model has not been initialised; "
                "call set_model() or run_model() first"
            )
        return model

    def set_model(self):
        """Instantiate ``self.model`` for the configured network.

        NOTE: every ``network`` value other than ``"roberta"`` (including
        typos) silently falls back to DistilBERT; this is kept for backward
        compatibility.
        """
        if self.network == "roberta":
            name = 'roberta'
            encoding = 'roberta-base'
        else:
            name = 'distilbert'
            encoding = 'distilbert-base-uncased'

        self.model = TransformerModel(
            name,
            encoding,
            num_labels=self.num_labels,
            reprocess_input_data=True,
            num_train_epochs=self.epochs,
            learning_rate=self.lr,
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            fp16=False,
            save_steps=50000,
        )

    def train_model(self):
        """Train the model on the ``'text'``/``'cmp_code'`` training columns.

        Raises:
            RuntimeError: If ``set_model`` has not been called yet.
        """
        model = self._require_model()
        model.train(self.train_data['text'], self.train_data['cmp_code'],
                    show_running_loss=True)

    def evaluate_model(self, valid_data):
        """Evaluate on ``valid_data`` and report accuracy and metrics.

        Prints the accuracy, the classification report and the confusion
        matrix; the report and the matrix are also appended to the results
        file through ``print_info``.

        Args:
            valid_data: Object indexed by ``'text'`` and ``'cmp_code'``;
                ``valid_data['cmp_code']`` must provide ``tolist()``.

        Raises:
            RuntimeError: If ``set_model`` has not been called yet.
            ValueError: If ``valid_data`` contains no labels.
        """
        model = self._require_model()
        labels = valid_data['cmp_code'].tolist()
        if not labels:
            # Without this guard the accuracy below divides by zero.
            raise ValueError("valid_data contains no examples to evaluate")

        # Only the raw outputs are needed; the other two return values of
        # evaluate() are not used here.
        _result, model_outputs, _wrong_predictions = model.evaluate(
            valid_data['text'], valid_data['cmp_code'])
        # Predicted class = index of the largest value along axis 1
        # (assumes one row of per-class outputs per example -- TODO confirm).
        preds = np.argmax(model_outputs, axis=1)
        assert len(labels) == len(preds), "the size of the prediction must be equal to the size of the labels"

        correct = sum(1 for pred, label in zip(preds, labels) if pred == label)
        accuracy = correct / len(labels)
        print("Accuracy_Valid: ", accuracy)

        print("Metrics")
        # Build the report once and reuse it for both stdout and the file.
        report = metrics.classification_report(labels, preds, digits=3)
        print(report)
        self.print_info(str(report))

        print("Confusion Matrix")
        cm = confusion_matrix(labels, preds)
        print(cm)
        self.print_info(str(cm))

    def run_model(self, valid_data):
        """Build the model, train it, then evaluate it on ``valid_data``."""
        self.set_model()
        self.train_model()
        self.evaluate_model(valid_data)

    def print_info(self, data):
        """Append ``data`` to the results file as a newline-terminated entry.

        A trailing newline is added when ``data`` does not already end with
        one, so that successive entries do not run together in the file.
        """
        text = data if data.endswith("\n") else data + "\n"
        # Instances created without __init__ fall back to the historic path.
        path = getattr(self, "results_file", "results.txt")
        with open(path, "a", encoding="utf-8") as f:
            f.write(text)