# fake-news-mlt / main.py
import time

import pandas
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from ensemble_voter import EnsembleVoter

# Print every column of a DataFrame instead of eliding the middle ones.
pandas.set_option("display.max_columns", None)

# NOTE: `pd` below is the training DataFrame, not the usual alias for the
# pandas module.
#
# Both data files are read as tab-separated text. The delimiter is written as
# the "\t" escape rather than a literal tab so it cannot be silently turned
# into spaces by an editor or formatter. quoting=3 is csv.QUOTE_NONE: quote
# characters inside a field are treated as ordinary text.
pd = pandas.read_csv("mediaeval-2015-trainingset.txt", delimiter="\t", quoting=3, encoding="utf8")

test_set = pandas.read_csv("mediaeval-2015-testset.txt", delimiter="\t", quoting=3, encoding="utf8")


def make_multinomial(training, testing):
    """Fit a TF-IDF + multinomial naive Bayes pipeline on the training data.

    Args:
        training: DataFrame passed through ``EnsembleVoter.process_dataframe``
            (with ``url``, ``punctuation`` and ``emoji`` all set to False);
            the result must expose ``tweetText`` and ``label`` columns.
        testing: Unused by this function.

    Returns:
        The fitted ``Pipeline``.
    """
    prepared = EnsembleVoter.process_dataframe(
        training, url=False, punctuation=False, emoji=False
    )

    tfidf_step = ('tfidf', TfidfVectorizer())
    bayes_step = ('clf', MultinomialNB())
    model = Pipeline([tfidf_step, bayes_step])

    # NOTE(review): this fits on the ``tweetText`` column, whereas the other
    # builders in this file fit on ``processedTweets`` -- confirm intentional.
    model.fit(prepared["tweetText"], prepared["label"])
    return model


def make_k_neighbours(training):
    """Fit a TF-IDF vectorizer and a k-nearest-neighbours classifier.

    Unlike the pipeline-based builders in this file, the vectorizer and the
    classifier are returned separately, so callers must transform their text
    with the returned vectorizer before calling ``predict``.

    Args:
        training: DataFrame passed through ``EnsembleVoter.process_dataframe``
            (with ``emoji=False``); the result must expose ``processedTweets``
            and ``label`` columns.

    Returns:
        A ``(vectorizer, classifier)`` tuple, both already fitted.
    """
    frame = EnsembleVoter.process_dataframe(training, emoji=False)

    vectorizer = TfidfVectorizer(
        use_idf=True,
        norm='l2',
        ngram_range=(1, 2),
        max_df=0.75,
    )
    features = vectorizer.fit_transform(frame["processedTweets"])

    classifier = KNeighborsClassifier(weights='uniform', n_neighbors=9, n_jobs=-1)
    classifier.fit(features, frame["label"])
    return vectorizer, classifier


def make_perceptron(training, testing):
    """Fit a TF-IDF + perceptron pipeline on the processed training tweets.

    Args:
        training: DataFrame passed through ``EnsembleVoter.process_dataframe``
            with its default options; the result must expose
            ``processedTweets`` and ``label`` columns.
        testing: Unused by this function.

    Returns:
        The fitted ``Pipeline``.
    """
    # Imported locally so this function is self-contained; the file's
    # top-level imports do not include Perceptron.
    from sklearn.linear_model import Perceptron

    training = EnsembleVoter.process_dataframe(training)
    estimator = Pipeline([
        ('tfidf', TfidfVectorizer(use_idf=True, norm='l1', ngram_range=(1, 1), max_df=0.5)),
        # The step previously had a name but no estimator, which made
        # Pipeline.fit fail while unpacking its steps.
        # NOTE(review): default Perceptron hyper-parameters are used because
        # the originally tuned values are not available here -- confirm.
        ('clf', Perceptron()),
    ])
    estimator.fit(training["processedTweets"], training["label"])
    return estimator


def make_passive(training):
    """Fit a TF-IDF + passive-aggressive classifier pipeline.

    Args:
        training: DataFrame passed through ``EnsembleVoter.process_dataframe``
            with its default options; the result must expose
            ``processedTweets`` and ``label`` columns.

    Returns:
        The fitted ``Pipeline``.
    """
    frame = EnsembleVoter.process_dataframe(training)

    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', ngram_range=(1, 1), max_df=1.0)
    classifier = PassiveAggressiveClassifier(
        C=1,
        max_iter=50,
        n_jobs=-1,
        fit_intercept=True,
    )

    model = Pipeline([('tfidf', vectorizer), ('clf', classifier)])
    model.fit(frame["processedTweets"], frame["label"])
    return model


def make_svc(training):
    """Fit a TF-IDF + linear SVC pipeline on the processed training tweets.

    Args:
        training: DataFrame passed through ``EnsembleVoter.process_dataframe``
            (with ``punctuation=False``); the result must expose
            ``processedTweets`` and ``label`` columns.

    Returns:
        The fitted ``Pipeline``.
    """
    frame = EnsembleVoter.process_dataframe(training, punctuation=False)

    vectorizer = TfidfVectorizer(use_idf=True, norm='l1', ngram_range=(1, 1), max_df=1.0)
    classifier = LinearSVC(
        C=2,
        penalty='l1',
        fit_intercept=True,
        max_iter=50,
        dual=False,
    )

    model = Pipeline([('tfidf', vectorizer), ('clf', classifier)])
    model.fit(frame["processedTweets"], frame["label"])
    return model

def validate_classifier(estimator, training_x, training_y, testing_x, testing_y):
    """Print a hold-out validation report and a test-set report for an estimator.

    The estimator is first fitted on 75% of the training data and scored on
    the remaining 25% (fixed ``random_state=1``). It is then re-fitted on the
    full training data and scored on the test data.

    Args:
        estimator: Object providing ``fit(X, y)`` and ``predict(X)``.
        training_x: Training inputs.
        training_y: Training labels.
        testing_x: Test inputs.
        testing_y: Test labels.

    Returns:
        The estimator, fitted on the full training data.
    """
    report_labels = ["real", "fake"]

    split = train_test_split(training_x, training_y, test_size=0.25, random_state=1)
    fit_x, holdout_x, fit_y, holdout_y = split

    # Stage 1: score on the held-out quarter of the training data.
    estimator.fit(fit_x, fit_y)
    print("Validation Score: ")
    holdout_report = metrics.classification_report(
        holdout_y, estimator.predict(holdout_x), labels=report_labels
    )
    print(holdout_report)

    # Stage 2: re-fit on everything, then score on the separate test set.
    print("Test Set Score: ")
    estimator.fit(training_x, training_y)
    test_report = metrics.classification_report(
        testing_y, estimator.predict(testing_x), labels=report_labels
    )
    print(test_report)
    return estimator

def make_voting_classifier():
    """Train every base classifier, register each with an EnsembleVoter and
    print the ensemble's classification report on the test set.

    Reads the module-level ``pd`` (training DataFrame) and ``test_set``
    DataFrame. Each classifier is registered together with the
    ``process_dataframe`` options that were used when training it.

    Returns:
        None. The report is printed to stdout.
    """
    voter = EnsembleVoter()

    # make_k_neighbours takes only the training frame and returns a fitted
    # (vectorizer, classifier) pair; calling it with two arguments raised
    # TypeError. The fitted pair is wrapped in a Pipeline so the voter gets a
    # single text-in/label-out estimator like every other classifier below.
    # NOTE(review): assumes EnsembleVoter.add_classifier expects one estimator
    # with a ``predict`` method rather than a tuple -- confirm.
    knn_vectorizer, knn_classifier = make_k_neighbours(pd)
    knn = Pipeline([('tfidf', knn_vectorizer), ('clf', knn_classifier)])
    voter.add_classifier(knn, {"emoji": False})

    m = make_multinomial(pd, test_set)
    voter.add_classifier(m, {'url': False, 'punctuation': False, 'emoji': False})

    perc = make_perceptron(pd, test_set)
    voter.add_classifier(perc, {})

    passive = make_passive(pd)
    voter.add_classifier(passive, {})

    svc = make_svc(pd)
    voter.add_classifier(svc, {"punctuation": False})

    p = voter.predict(test_set)
    print(metrics.classification_report(test_set["label"], p, labels=["real", "fake"]))

if __name__ == "__main__":
    # Script entry point: train the k-nearest-neighbours model on the training
    # frame and print its classification report on the test set.
    vectorizer, knn_model = make_k_neighbours(pd)

    # The test frame is processed with the same option (emoji=False) that
    # make_k_neighbours uses for the training frame.
    processed_test = EnsembleVoter.process_dataframe(test_set, emoji=False)
    test_features = vectorizer.transform(processed_test["processedTweets"])

    predicted = knn_model.predict(test_features)
    report = metrics.classification_report(
        processed_test["label"], predicted, labels=["real", "fake"]
    )
    print(report)