"""ensemble_voter.py -- a majority-vote ensemble of text classifiers for
fake-news tweet detection."""
import collections
import re
import string
from html import unescape

import emoji
import contractions
import langdetect
import nltk
from iso639 import languages
from langdetect import detect
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.base import BaseEstimator


class EnsembleVoter(BaseEstimator):
    def __init__(self):
        self.classifiers = []
        self.processing_args = {}

    @staticmethod
    def remove_emoji(text):
        """Demojize rather than delete: 🔥 becomes ":fire:", so emoji survive
        as plain-text tokens (despite this method's name)."""
        return emoji.demojize(text)

    @staticmethod
    def remove_stops(text, language_data):
        """Detect the tweet's language, strip its stopwords, and record the
        detected language in ``language_data``."""
        try:
            language = detect(text)
            try:
                # map the ISO 639-1 code (e.g. "en") to a full name ("English")
                language = languages.get(alpha2=language).name
            except KeyError:
                pass
        except langdetect.LangDetectException:
            language = "unknown"

        # langdetect often mislabels short English tweets as Welsh
        if language == "Welsh":
            language = "English"
        language_data.append(language)

        stops = []
        if language.lower() in stopwords.fileids():
            # NLTK fileids are lowercase, so "English" must become "english"
            stops = stopwords.words(language.lower())
        elif language.lower() == "tagalog":
            # Tagalog is not in NLTK's corpus; use a local word list instead
            with open("tagalog_stopwords.txt", "r", encoding="utf-8") as f:
                stops = [line.strip() for line in f]
        if stops:
            text = " ".join(word for word in text.split(" ") if word not in stops)
        return text, language_data
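
    # Illustrative call (hypothetical input; the exact result depends on the
    # installed langdetect model and NLTK stopword data):
    #
    #   text, langs = EnsembleVoter.remove_stops("this is a fabricated story", [])
    #   # text  -> "fabricated story"   ("this", "is", "a" are stopwords)
    #   # langs -> ["English"]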

    @staticmethod
    def process_dataframe(data, url=True, punctuation=True, emoji=True, contract=True, stops=True):
        """Clean data["tweetText"] into a data["processedTweets"] column; each
        keyword flag toggles one preprocessing step. (Note that the ``emoji``
        parameter shadows the ``emoji`` module within this scope.)"""
        processed = []
        parts_of_speech = []
        number_of_words = []
        number_of_characters = []
        number_of_capitals = []
        number_of_capital_words = []
        punctuation_count = []
        retweet = []
        sentence_count = []
        hashtags = []
        mentions = []
        stopword_count = []
        sentiment_polarity = []

        for index, row in data.iterrows():
            # merge the "humor" class into "fake"; write through .at because
            # assigning to the iterrows() row only mutates a copy
            if row["label"] == "humor":
                data.at[index, "label"] = "fake"
            text = row["tweetText"]

            # blob = TextBlob(text)
            # number_of_words.append(len(blob.words))
            # number_of_characters.append(len(text))
            # number_of_capitals.append(len([x for x in text if x.isupper()]))
            # number_of_capital_words.append(len([x for x in text.split() if x[0].isupper()]))
            # punctuation_count.append(len([x for x in text if x in string.punctuation]))
            # exp = re.compile(r'([“|"|”].*["|”|“]|\sRT\s)')
            # retweet.append(1 if exp.search(text) else 0)
            # sentence_count.append(len(blob.sentences))
            # mentions.append(len(re.findall(r'(@w[A-Za-z0-9]*)', text)))
            # hashtags.append(len(re.findall(r'(#w[A-Za-z0-9]*)', text)))
            # stopword_count.append(len([w for w in text if w in stopwords.words("english") or w in stopwords.words("spanish")]))
            # sentiment_polarity.append(blob.sentiment.polarity)

            text = text.lower()
            # strip t.co links (matches both "//" and JSON-escaped "\/\/" forms)
            if url:
                text = re.sub(r'https?:\s*(\\/\\/|//)t\.co(\\/|/)[a-zA-Z0-9]*', '', text)

            # remove punctuation
            if punctuation:
                # curly quotes are not in string.punctuation, so strip them first
                text = re.sub(r"[“”]", "", text)
                punct = set(string.punctuation)
                text = "".join(ch for ch in text if ch not in punct)

            # swap emojis out for identifier
            if emoji:
                text = EnsembleVoter.remove_emoji(text)

            # remove html characters
            text = unescape(text)

            # expand contractions; note that the punctuation pass above has
            # already stripped apostrophes, so contractions.fix sees "isnt"
            # rather than "isn't" when punctuation=True
            if contract:
                text = " ".join(contractions.fix(word) for word in text.split())

            # remove stopwords for english and spanish (build the set once per
            # tweet instead of re-reading the corpus for every word)
            if stops:
                stop_set = set(stopwords.words("english") + stopwords.words("spanish"))
                text = " ".join(w for w in text.split() if w not in stop_set)

            # lemmatizer = WordNetLemmatizer()

            # remove extra whitespace
            text = text.strip()
            # POS tagging was an experiment; parts_of_speech is never attached
            # to the DataFrame below, so skip the costly per-tweet tagging too
            # tokens = word_tokenize(text)
            # text = " ".join([lemmatizer.lemmatize(w) for w in tokens])
            # pos = nltk.pos_tag(tokens)
            # pos_string = " ".join(f"{x[0]}_{x[1]}" for x in pos)
            # parts_of_speech.append(pos_string)

            processed.append(text)
        data["processedTweets"] = processed
        # data["pos"] = parts_of_speech
        # data["number_of_words"] = number_of_words
        # data["number_of_characters"] = number_of_characters
        # data["number_of_capitals"] = number_of_capitals
        # data["number_of_capital_words"] = number_of_capital_words
        # data["punctuation_count"] = punctuation_count
        # data["retweets"] = retweet
        # data["sentence_count"] = sentence_count
        # data["hashtags"] = hashtags
        # data["mentions"] = mentions
        # data["stopword_count"] = stopword_count
        # data["sentiment"] = sentiment_polarity
        # data['avg_wordlength'] = data['number_of_characters'] / data['number_of_words']
        # data['avg_sentlength'] = data['number_of_words'] / data['sentence_count']
        # data['stopwords_vs_words'] = data['stopword_count'] / data['number_of_words']
        return data
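
    # Rough before/after for process_dataframe on one tweet (illustrative
    # only; exact output depends on the installed NLTK stopword lists):
    #
    #   "RT @user: Breaking!! 🔥 https://t.co/abc123"
    #   -> "rt user breaking :fire:"
    #      (lowercased, t.co link stripped, punctuation removed,
    #       emoji demojized, stopwords dropped)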

    def fit(self, X, y):
        # unlike predict(), fit() does not call process_dataframe, so X should
        # already be preprocessed text (e.g. the "processedTweets" column)
        for classifier in self.classifiers:
            # X, y = check_X_y(X, y)
            classifier.fit(X, y)
        self.X_ = X
        self.y_ = y
        return self

    def add_classifier(self, classifier, processing_args):
        """Register a classifier together with the process_dataframe kwargs
        that predict() should apply to the test data on its behalf."""
        self.classifiers.append(classifier)
        self.processing_args[classifier] = processing_args

    def predict(self, testing_x):
        predictions = []
        final_predictions = []
        for classifier in self.classifiers:
            print(f"Predicting for {classifier}")
            # copy so process_dataframe never mutates the caller's DataFrame
            test_data = EnsembleVoter.process_dataframe(testing_x.copy(), **self.processing_args[classifier])
            p = classifier.predict(test_data["processedTweets"])
            predictions.append(p)

        print("Collecting Votes")
        for i in range(len(predictions[0])):
            votes = collections.defaultdict(int)
            for j in range(len(predictions)):
                votes[predictions[j][i]] += 1
            final_predictions.append(max(votes, key=votes.get))

        return final_predictions
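

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It assumes NLTK's
# "stopwords" corpus is downloaded and wires two hypothetical scikit-learn
# pipelines into the voter; the toy DataFrame below is illustrative only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import make_pipeline

    train_df = pd.DataFrame({
        "tweetText": [
            "breaking shark swims down a flooded street https://t.co/abc1",
            "city council approves new bike lanes for downtown",
            "aliens secretly built the pyramids 🔥",
            "local library extends its weekend opening hours",
        ],
        "label": ["fake", "real", "fake", "real"],
    })

    voter = EnsembleVoter()
    # each classifier can request its own preprocessing flags
    voter.add_classifier(make_pipeline(CountVectorizer(), MultinomialNB()), {})
    voter.add_classifier(make_pipeline(TfidfVectorizer(), MultinomialNB()),
                         {"stops": False})

    # fit() does not preprocess, so run process_dataframe once for training
    train_processed = EnsembleVoter.process_dataframe(train_df.copy())
    voter.fit(train_processed["processedTweets"], train_processed["label"])

    # predict() preprocesses per classifier using the registered kwargs
    print(voter.predict(train_df.copy()))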