import pandas as pd
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import simplemma  # requires: pip install simplemma

class DataManager:
    """
    Reads a manifesto dataset from a csv file and applies the preprocessing
    pipeline (cleaning, stemming, lemmatization). Returns either the whole
    preprocessed dataset or the validation/training split for one fold.
    path: path to the csv file
    num_folds: total number of cross-validation folds to create
    fold: 0-based index of the fold whose validation/training split is returned
    """
    def __init__(self, path, num_folds=None, fold=None):
        self.path = path
        if fold is not None and num_folds is not None:
            assert 0 <= fold < num_folds, "fold index must be between 0 and num_folds - 1"
        else:
            assert fold is None and num_folds is None, "num_folds and fold must both be set or both be None"
        self.fold = fold
        self.num_folds = num_folds
    
    def read_csv(self):
        """Reads the manifesto csv file into a DataFrame."""
        manifesto_dataset = pd.read_csv(self.path)
        return manifesto_dataset
    
    def clean_dataset(self, manifesto_dataset):
        """
        Cleans the manifesto dataset: removes numbers and special characters,
        drops NaN rows, and lowercases the text.
        """
        manifesto_dataset["text"] = manifesto_dataset["text"].str.replace(r"\d+", "", regex=True)
        manifesto_dataset["text"] = manifesto_dataset["text"].str.replace(r"\W", " ", regex=True)
        manifesto_dataset = manifesto_dataset[manifesto_dataset["text"].notna()]
        manifesto_dataset["text"] = manifesto_dataset["text"].str.lower()
        return manifesto_dataset
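
    # Rough illustration of the cleaning above (illustrative values, not from
    # the original code): "Artículo 5: ¡salud!" -> digits removed, non-word
    # characters blanked, lowercased -> "artículo    salud ". Note that
    # accented letters count as word characters and survive \W.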
    
    def apply_stemmer(self, manifesto_dataset):
        """Stems every token with the Spanish Snowball stemmer (expects tokenized text)."""
        stemmer = SnowballStemmer("spanish")
        manifesto_dataset["text"] = manifesto_dataset["text"].apply(lambda tokens: [stemmer.stem(t) for t in tokens])
        return manifesto_dataset
    
    def lemmatize_text(self, text):
        """Lemmatizes each token with simplemma's Spanish model."""
        return [simplemma.lemmatize(w, lang="es") for w in text]
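
    # Assumed simplemma behavior for a plain plural (exact output may vary
    # by simplemma version):
    # simplemma.lemmatize("estudiantes", lang="es") -> "estudiante"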
    
    def apply_lemmatizer(self, manifesto_dataset):
        manifesto_dataset["text"] = manifesto_dataset["text"].apply(self.lemmatize_text)
        return manifesto_dataset
    
    def preprocess_dataset(self, manifesto_dataset):
        """
        Tokenizes the text, applies stemming and then lemmatization (note
        that the lemmatizer therefore runs on already-stemmed tokens), and
        joins the tokens back into strings.
        """
        manifesto_dataset["text"] = manifesto_dataset["text"].str.split()
        manifesto_dataset = self.apply_stemmer(manifesto_dataset)
        manifesto_dataset = self.apply_lemmatizer(manifesto_dataset)
        manifesto_dataset["text"] = manifesto_dataset["text"].str.join(" ")
        return manifesto_dataset
        
    def generate_crossval(self, manifesto_dataset):
        """
        Returns the validation and training sets for the requested fold.
        """
        num_cases = len(manifesto_dataset)
        val_num = num_cases // self.num_folds  # size of one validation fold
        all_cases = np.arange(num_cases)
        # fixed-seed permutation of all row positions, shared across folds
        permutation = np.random.RandomState(seed=0).choice(all_cases, size=num_cases, replace=False)

        # the fold-th contiguous chunk of the permutation is the validation set
        start_fold = self.fold * val_num
        end_fold = start_fold + val_num
        val_fold = permutation[start_fold:end_fold]

        train_fold = np.setdiff1d(all_cases, val_fold)  # already sorted
        val_fold = np.sort(val_fold)
        val_dataset = manifesto_dataset.iloc[val_fold]
        train_dataset = manifesto_dataset.iloc[train_fold]

        return val_dataset, train_dataset
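
    # Worked example of the fold arithmetic (hypothetical sizes): with
    # num_cases=100 and num_folds=5, val_num=20, so fold=2 takes positions
    # 40..59 of the fixed permutation as validation and the other 80 rows as
    # training. Any trailing num_cases % num_folds rows never enter a
    # validation fold.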
    
    def return_dataset(self):
        """
        Reads the csv file, cleans and preprocesses it, and returns either the
        (validation, training) split for the requested fold or the whole dataset.
        """
        manifesto_dataset = self.read_csv()
        manifesto_dataset = self.clean_dataset(manifesto_dataset)
        manifesto_dataset = self.preprocess_dataset(manifesto_dataset)

        if self.fold is not None and self.num_folds is not None:
            val_dataset, train_dataset = self.generate_crossval(manifesto_dataset)
            return val_dataset, train_dataset
        else:
            return manifesto_dataset
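

# Minimal usage sketch (not part of the original module). The csv path
# "manifestos.csv" is a placeholder; the file is assumed to contain a
# "text" column, as the methods above expect.
if __name__ == "__main__":
    # Whole-dataset mode: no folds, returns one preprocessed DataFrame.
    dm = DataManager("manifestos.csv")
    dataset = dm.return_dataset()
    print(dataset.head())

    # Cross-validation mode: 5 folds, validation/training split for fold 0.
    dm_cv = DataManager("manifestos.csv", num_folds=5, fold=0)
    val_dataset, train_dataset = dm_cv.return_dataset()
    print(len(val_dataset), len(train_dataset))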