"""Train and evaluate tweet-veracity classifiers on the MediaEval 2015 data.

Each ``make_*`` function fits one text classifier on the training set; the
fitted models can be evaluated individually or combined through
``EnsembleVoter`` in ``make_voting_classifier``.
"""

import time

import pandas
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from ensemble_voter import EnsembleVoter

pandas.set_option("display.max_columns", None)

# Class labels, in the order they are listed in every classification report.
_REPORT_LABELS = ["real", "fake"]

# The dataset files are read as tab-separated: with quoting disabled
# (quoting=3 is csv.QUOTE_NONE) a single-space delimiter cannot represent the
# free-text tweet column, because tweets themselves contain spaces.
# NOTE: `pd` is the training DataFrame, not the usual alias for pandas; the
# name is kept so existing users of this module keep working.
pd = pandas.read_csv(
    "mediaeval-2015-trainingset.txt", delimiter="\t", quoting=3, encoding="utf8"
)
test_set = pandas.read_csv(
    "mediaeval-2015-testset.txt", delimiter="\t", quoting=3, encoding="utf8"
)


def make_multinomial(training, testing):
    """Fit a TF-IDF + multinomial naive Bayes pipeline.

    Args:
        training: DataFrame with ``tweetText`` and ``label`` columns.
        testing: Unused; accepted so existing two-argument calls keep working.

    Returns:
        The fitted ``Pipeline``.
    """
    training = EnsembleVoter.process_dataframe(
        training, url=False, punctuation=False, emoji=False
    )
    estimator = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ])
    # NOTE(review): this model is fitted on the "tweetText" column, while every
    # other model uses "processedTweets" -- confirm this is intentional.
    estimator.fit(training["tweetText"], training["label"])
    return estimator


def make_k_neighbours(training):
    """Fit a TF-IDF vectorizer and a k-nearest-neighbours classifier.

    Args:
        training: DataFrame with a ``label`` column; it is passed through
            ``EnsembleVoter.process_dataframe`` (with ``emoji=False``) and the
            ``processedTweets`` column of the result is used as the text.

    Returns:
        A ``(vectorizer, classifier)`` tuple, both fitted. Text must be passed
        through ``vectorizer.transform`` before ``classifier.predict``.
    """
    training = EnsembleVoter.process_dataframe(training, emoji=False)
    tf = TfidfVectorizer(use_idf=True, norm="l2", ngram_range=(1, 2), max_df=0.75)
    tf_tweet_features = tf.fit_transform(training["processedTweets"])
    clf = KNeighborsClassifier(weights="uniform", n_neighbors=9, n_jobs=-1)
    clf.fit(tf_tweet_features, training["label"])
    return tf, clf


def make_perceptron(training, testing):
    """Fit a TF-IDF + perceptron pipeline.

    Args:
        training: DataFrame with a ``label`` column; the ``processedTweets``
            column produced by ``EnsembleVoter.process_dataframe`` is the text.
        testing: Unused; accepted so existing two-argument calls keep working.

    Returns:
        The fitted ``Pipeline``.
    """
    training = EnsembleVoter.process_dataframe(training)
    estimator = Pipeline([
        ("tfidf", TfidfVectorizer(use_idf=True, norm="l1", ngram_range=(1, 1), max_df=0.5)),
        # The classifier step previously had no estimator at all, which made
        # the pipeline unusable.
        # NOTE(review): default hyperparameters are used here; replace them
        # with the tuned values if they are known.
        ("clf", Perceptron()),
    ])
    estimator.fit(training["processedTweets"], training["label"])
    return estimator


def make_passive(training):
    """Fit a TF-IDF + passive-aggressive classifier pipeline.

    Args:
        training: DataFrame with a ``label`` column; the ``processedTweets``
            column produced by ``EnsembleVoter.process_dataframe`` is the text.

    Returns:
        The fitted ``Pipeline``.
    """
    training = EnsembleVoter.process_dataframe(training)
    estimator = Pipeline([
        ("tfidf", TfidfVectorizer(use_idf=True, norm="l2", ngram_range=(1, 1), max_df=1.0)),
        ("clf", PassiveAggressiveClassifier(C=1, max_iter=50, n_jobs=-1, fit_intercept=True)),
    ])
    estimator.fit(training["processedTweets"], training["label"])
    return estimator


def make_svc(training):
    """Fit a TF-IDF + L1-penalised linear SVM pipeline.

    Args:
        training: DataFrame with a ``label`` column; it is passed through
            ``EnsembleVoter.process_dataframe`` (with ``punctuation=False``)
            and the ``processedTweets`` column of the result is the text.

    Returns:
        The fitted ``Pipeline``.
    """
    training = EnsembleVoter.process_dataframe(training, punctuation=False)
    estimator = Pipeline([
        ("tfidf", TfidfVectorizer(use_idf=True, norm="l1", ngram_range=(1, 1), max_df=1.0)),
        ("clf", LinearSVC(C=2, penalty="l1", fit_intercept=True, max_iter=50, dual=False)),
    ])
    estimator.fit(training["processedTweets"], training["label"])
    return estimator


def validate_classifier(estimator, training_x, training_y, testing_x, testing_y):
    """Print validation and test-set reports for ``estimator``.

    The estimator is first fitted on 75% of the training data and scored on
    the held-out 25% (fixed ``random_state=1``), then refitted on all of the
    training data and scored on the test data.

    Args:
        estimator: Object exposing ``fit`` and ``predict``.
        training_x: Training inputs.
        training_y: Training labels.
        testing_x: Test inputs.
        testing_y: Test labels.

    Returns:
        The estimator, fitted on the full training data.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        training_x, training_y, test_size=0.25, random_state=1
    )
    estimator.fit(X_train, y_train)
    print("Validation Score: ")
    predictions = estimator.predict(X_val)
    print(metrics.classification_report(y_val, predictions, labels=_REPORT_LABELS))

    print("Test Set Score: ")
    # Refit on the complete training set before scoring the test set.
    estimator.fit(training_x, training_y)
    predictions = estimator.predict(testing_x)
    print(metrics.classification_report(testing_y, predictions, labels=_REPORT_LABELS))
    return estimator


def make_voting_classifier():
    """Train every model, register each with an ``EnsembleVoter`` and print
    the ensemble's classification report on the test set.

    Each classifier is registered together with the preprocessing options it
    was trained with.
    """
    voter = EnsembleVoter()

    # make_k_neighbours takes only the training frame and returns the
    # vectorizer and the classifier separately; combine the two already-fitted
    # parts into a single pipeline so this member has the same form as the
    # others.
    # NOTE(review): assumes EnsembleVoter calls predict() on text for every
    # registered classifier -- confirm against ensemble_voter.
    knn_vectorizer, knn_classifier = make_k_neighbours(pd)
    knn = Pipeline([
        ("tfidf", knn_vectorizer),
        ("clf", knn_classifier),
    ])
    voter.add_classifier(knn, {"emoji": False})

    m = make_multinomial(pd, test_set)
    voter.add_classifier(m, {"url": False, "punctuation": False, "emoji": False})

    perc = make_perceptron(pd, test_set)
    voter.add_classifier(perc, {})

    passive = make_passive(pd)
    voter.add_classifier(passive, {})

    svc = make_svc(pd)
    voter.add_classifier(svc, {"punctuation": False})

    p = voter.predict(test_set)
    print(metrics.classification_report(test_set["label"], p, labels=_REPORT_LABELS))


if __name__ == "__main__":
    # Evaluate the k-nearest-neighbours model on its own against the test set.
    tf, knn = make_k_neighbours(pd)
    test = EnsembleVoter.process_dataframe(test_set, emoji=False)
    tweet_features = tf.transform(test["processedTweets"])
    p = knn.predict(tweet_features)
    print(metrics.classification_report(test["label"], p, labels=_REPORT_LABELS))