from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from itertools import product

from main import pd, test_set
from ensemble_voter import EnsembleVoter


# test many classifiers on a given set of data
# modified from https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html
def test_classifiers(X_train, y_train, X_test, y_test):
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    def benchmark(clf):
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        score = metrics.accuracy_score(y_test, pred)
        f1_score = metrics.f1_score(y_test, pred, average="micro", labels=["real", "fake"])
        clf_descr = str(clf).split("(")[0]
        return clf_descr, score, f1_score

    results = []
    for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest"),
    ):
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        # Train Liblinear model
        results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
        # Train SGD model
        results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty)))

    # Train SGD with Elastic Net penalty
    results.append(
        benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet"))
    )

    # Train NearestCentroid without threshold
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    results.append(benchmark(MultinomialNB()))
    results.append(benchmark(BernoulliNB()))
    results.append(benchmark(ComplementNB()))

    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
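    # An L1-penalised LinearSVC drives many feature weights to exactly zero,
    # so SelectFromModel keeps only the features with non-zero coefficients
    # before the final L2-penalised classifier is fit on the reduced matrix.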
    results.append(
        benchmark(
            Pipeline(
                [
                    (
                        "feature_selection",
                        SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                    ),
                    ("classification", LinearSVC(penalty="l2")),
                ]
            )
        )
    )

    # keep only the classifiers whose micro-averaged F1 beats 0.75
    return list(filter(lambda x: x[2] > 0.75, results))


# test different combinations of preprocessing options with all classifiers to find best
def test_config_combinations():
    opts = {
        "url": (True, False),
        "punctuation": (True, False),
        "emoji": (True, False),
        "contract": (True, False),
        "stops": (True, False),
    }
    keys = opts.keys()
    values = (opts[key] for key in keys)
    combinations = [dict(zip(keys, combination)) for combination in product(*values)]

    results = []
    for i, combo in enumerate(combinations):
        print(f"Trying combo {i}")
        training = EnsembleVoter.process_dataframe(pd, **combo)
        test = EnsembleVoter.process_dataframe(test_set, **combo)
        # make_cv_bayes_estimator(pd, test)
        X_train, y_train = training["processedTweets"], training["label"]
        X_test, y_test = test["processedTweets"], test["label"]
        # stored as flat pairs: [scores_0, combo_0, scores_1, combo_1, ...]
        results += (test_classifiers(X_train, y_train, X_test, y_test), combo)

    best_per_classifier = {}
    for i in range(0, len(results), 2):
        for name, _, f1_micro in results[i]:
            config = results[i + 1]
            if best_per_classifier.get(name, (0, {}))[0] < f1_micro:
                best_per_classifier[name] = (f1_micro, config)

    for name, scores in sorted(best_per_classifier.items(), reverse=True, key=lambda x: x[1][0]):
        print(f"{name}: {round(scores[0], 3)} - {scores[1]}")


def tune_multinomial(training, testing):
    training = EnsembleVoter.process_dataframe(training, url=False, punctuation=False, emoji=False)
    testing = EnsembleVoter.process_dataframe(testing, url=False, punctuation=False, emoji=False)

    cv_params = {"ngram_range": [(1, 1), (1, 2), (1, 3)], "max_df": (0.5, 0.75, 1.0)}
    cv_keys = cv_params.keys()
    cv_values = (cv_params[key] for key in cv_keys)
    cv_combinations = [dict(zip(cv_keys, combination)) for combination in product(*cv_values)]

    tf_params = {'use_idf': (True, False), 'norm': ('l1', 'l2')}
    tf_keys = tf_params.keys()
    tf_values = (tf_params[key] for key in tf_keys)
    tf_combinations = [dict(zip(tf_keys, combination)) for combination in product(*tf_values)]

    clf_params = {"alpha": (1, 0.001, 0.0001, 0.00001, 0.000001)}
    clf_keys = clf_params.keys()
    clf_values = (clf_params[key] for key in clf_keys)
    clf_combinations = [dict(zip(clf_keys, combination)) for combination in product(*clf_values)]

    best_f1 = 0
    best_combos = ()
    for clf_combo in clf_combinations:
        print(clf_combo)
        for tf_combo in tf_combinations:
            print(tf_combo)
            for cv_combo in cv_combinations:
                print(cv_combo)
                estimator = Pipeline([
                    ('vect', CountVectorizer(**cv_combo)),
                    ('tfidf', TfidfTransformer(**tf_combo)),
                    ('clf', MultinomialNB(**clf_combo)),
                ])
                estimator.fit(training["processedTweets"], training["label"])
                predictions = estimator.predict(testing["processedTweets"])
                f1 = metrics.f1_score(testing["label"], predictions, average="micro", labels=["real", "fake"])
                if f1 > best_f1:
                    best_f1 = f1
                    best_combos = (clf_combo, tf_combo, cv_combo)
    print(best_f1)
    print(best_combos)
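
# Illustrative sketch only (hypothetical helper, not called by the tuners in
# this module, which inline the same logic): each {"param": (values, ...)}
# grid expands via itertools.product into one kwargs dict per combination.
def _expand_grid(params):
    keys = list(params.keys())
    # e.g. {"alpha": (1, 0.1), "norm": ("l1", "l2")} ->
    #   [{"alpha": 1, "norm": "l1"}, {"alpha": 1, "norm": "l2"},
    #    {"alpha": 0.1, "norm": "l1"}, {"alpha": 0.1, "norm": "l2"}]
    return [dict(zip(keys, combo)) for combo in product(*(params[key] for key in keys))]
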
def tune_clf(clf, clf_params, tf_params, training, testing, dataset_kwargs):
    training = EnsembleVoter.process_dataframe(training, **dataset_kwargs)
    testing = EnsembleVoter.process_dataframe(testing, **dataset_kwargs)

    tf_keys = tf_params.keys()
    tf_values = (tf_params[key] for key in tf_keys)
    tf_combinations = [dict(zip(tf_keys, combination)) for combination in product(*tf_values)]

    clf_keys = clf_params.keys()
    clf_values = (clf_params[key] for key in clf_keys)
    clf_combinations = [dict(zip(clf_keys, combination)) for combination in product(*clf_values)]

    best_f1 = 0
    best_combos = ()
    for clf_combo in clf_combinations:
        print(clf_combo)
        for tf_combo in tf_combinations:
            print(tf_combo)
            estimator = Pipeline([
                ('tfidf', TfidfVectorizer(**tf_combo)),
                ('clf', clf(**clf_combo)),
            ])
            estimator.fit(training["processedTweets"], training["label"])
            predictions = estimator.predict(testing["processedTweets"])
            f1 = metrics.f1_score(testing["label"], predictions, average="micro", labels=["real", "fake"])
            if f1 > best_f1:
                best_f1 = f1
                best_combos = (clf_combo, tf_combo)
    print(best_f1)
    print(best_combos)


if __name__ == "__main__":
    # tune knn; the docstring below records the best score and params from a previous run
    """0.8687795010114634
    ({'weights': 'uniform', 'n_neighbors': 8, 'n_jobs': -1},
     {'use_idf': True, 'norm': 'l2', 'ngram_range': (1, 2), 'max_df': 0.75})"""
    knn_params = {"weights": ("uniform",), "n_neighbors": (7, 8, 9), "n_jobs": (-1,), "p": (1, 2)}
    knn_tf_params = {'use_idf': (True, False), 'norm': ('l2', 'l1'), "max_df": (0.7, 0.75, 0.8, 1)}
    tune_clf(KNeighborsClassifier, knn_params, knn_tf_params, pd, test_set, {"emoji": False})

    # tune multinomial naive Bayes
    multinomial_params = {"alpha": (1, 0.001, 0.0001, 0.00001, 0.000001)}
    # tune_clf(MultinomialNB, multinomial_params, knn_tf_params, pd, test_set, {'url': False, 'punctuation': False, 'emoji': False})

    # tune perceptron
    """0.8755259942989004
    ({'penalty': 'l2', 'alpha': 0.0001, 'n_jobs': -1, 'max_iter': 50},
     {'use_idf': True, 'norm': 'l1', 'ngram_range': (1, 1), 'max_df': 0.5})"""
    perceptron_params = {"penalty": ("l2", "l1", "elasticnet"), "alpha": (0.0001, 0.00001, 0.001, 0.1), "n_jobs": (-1,), "max_iter": (50, 100, 200)}
    # tune_clf(Perceptron, perceptron_params, knn_tf_params, pd, test_set, {})

    # tune passive-aggressive classifier
    """0.8310415248468347
    ({'C': 1, 'max_iter': 50, 'n_jobs': -1, 'fit_intercept': True},
     {'use_idf': True, 'norm': 'l2', 'ngram_range': (1, 1), 'max_df': 1.0})"""
    passive_params = {"C": (0.1, 0.5, 1, 2), "max_iter": (50, 100, 200), "n_jobs": (-1,), "fit_intercept": (True, False)}
    # tune_clf(PassiveAggressiveClassifier, passive_params, knn_tf_params, pd, test_set, {})

    # tune LinearSVC
    """0.8267022696929239
    ({'C': 2, 'penalty': 'l1', 'fit_intercept': True, 'max_iter': 50, 'dual': False},
     {'use_idf': True, 'norm': 'l1', 'ngram_range': (1, 1), 'max_df': 1.0})"""
    svc_params = {"C": (0.5, 1, 2), "penalty": ("l1", "l2"), "fit_intercept": (True, False), "max_iter": (50, 100, 200), "dual": (False,)}
    # tune_clf(LinearSVC, svc_params, knn_tf_params, pd, test_set, {"punctuation": False})
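
    # Example usage for the remaining entry points (commented out; assumes the
    # `pd` training frame and `test_set` from main carry the "processedTweets"
    # and "label" columns the functions above expect):
    # tune_multinomial(pd, test_set)
    # test_config_combinations()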