import collections import re import string from html import unescape import emoji import contractions import langdetect import nltk from iso639 import languages from langdetect import detect from nltk import word_tokenize from nltk.corpus import stopwords from sklearn.base import BaseEstimator class EnsembleVoter(BaseEstimator): def __init__(self): self.classifiers = [] self.processing_args = {} @staticmethod def remove_emoji(text): return emoji.demojize(text) @staticmethod def remove_stops(text, language_data): try: language = detect(text) try: language = languages.get(alpha2=language).name except KeyError: pass except langdetect.LangDetectException: language = "unknown" if language == "Welsh": language = "English" language_data.append(language) stops = [] if language.lower() in stopwords.fileids(): stops = stopwords.words(language) elif language.lower() == "tagalog": stops = [] with open("tagalog_stopwords.txt", "r") as f: for line in f.readlines(): stops.append(line.strip()) if stops: text = " ".join(word for word in text.split(" ") if word not in stops) return text, language_data @staticmethod def process_dataframe(data, url=True, punctuation=True, emoji=True, contract=True, stops=True): processed = [] parts_of_speech = [] number_of_words = [] number_of_characters = [] number_of_capitals = [] number_of_capital_words = [] punctuation_count = [] retweet = [] sentence_count = [] hashtags = [] mentions = [] stopword_count = [] sentiment_polarity = [] for index, row in data.iterrows(): m = [] h = [] if row["label"] == "humor": row["label"] = "fake" text = row["tweetText"] # blob = TextBlob(text) # number_of_words.append(len(blob.words)) # number_of_characters.append(len(text)) # number_of_capitals.append(len([x for x in text if x.isupper()])) # number_of_capital_words.append(len([x for x in text.split() if x[0].isupper()])) # punctuation_count.append(len([x for x in text if x in string.punctuation])) # exp = re.compile(r'([“|"|”].*["|”|“]|\sRT\s)') # retweet.append(1 if exp.search(text) else 0) # sentence_count.append(len(blob.sentences)) # mentions.append(len(re.findall(r'(@w[A-Za-z0-9]*)', text))) # hashtags.append(len(re.findall(r'(#w[A-Za-z0-9]*)', text))) # stopword_count.append(len([w for w in text if w in stopwords.words("english") or w in stopwords.words("spanish")])) # sentiment_polarity.append(blob.sentiment.polarity) text = text.lower() # swap link out for identifier if url: text = re.sub(r'https*:[\s]*(\\\/\\\/|\/\/)t.co(\\\/|\/)[a-zA-Z0-9]*', '', text) # remove punctuation if punctuation: text = re.sub(r"“|”", "", text) # dear lord why is this a thing text = "".join(ch for ch in text if ch not in set(string.punctuation)) # swap emojis out for identifier if emoji: text = EnsembleVoter.remove_emoji(text) # remove html characters text = unescape(text) # expand contractions if contract: expanded = [] for word in text.split(): expanded.append(contractions.fix(word)) text = " ".join(expanded) # remove stopwords for english and spanish if stops: text = " ".join( w for w in text.split() if w not in stopwords.words("english") and w not in stopwords.words("spanish")) # lemmatizer = WordNetLemmatizer() # remove extra whitespace text = text.strip() tokens = word_tokenize(text) # text = " ".join([lemmatizer.lemmatize(w) for w in tokens]) pos = nltk.pos_tag(tokens) pos_string = " ".join(f"{x[0]}_{x[1]}" for x in pos) # parts_of_speech.append(pos_string) processed.append(text) data["processedTweets"] = processed # data["pos"] = parts_of_speech # data["number_of_words"] = number_of_words # data["number_of_characters"] = number_of_characters # data["number_of_capitals"] = number_of_capitals # data["number_of_capital_words"] = number_of_capital_words # data["punctuation_count"] = punctuation_count # data["retweets"] = retweet # data["sentence_count"] = sentence_count # data["hashtags"] = hashtags # data["mentions"] = mentions # data["stopword_count"] = stopword_count # data["sentiment"] = sentiment_polarity # data['avg_wordlength'] = data['number_of_characters'] / data['number_of_words'] # data['avg_sentlength'] = data['number_of_words'] / data['sentence_count'] # data['stopwords_vs_words'] = data['stopword_count'] / data['number_of_words'] return data def fit(self, X, y): for classifier in self.classifiers: # X, y = check_X_y(X, y) classifier.fit(X, y) self.X_ = X self.y_ = y return self def add_classifier(self, classifier, processing_args): self.classifiers.append(classifier) self.processing_args[classifier] = processing_args def predict(self, testing_x): predictions = [] final_predictions = [] for classifier in self.classifiers: print(f"Predicting for {classifier}") test_data = EnsembleVoter.process_dataframe(testing_x, **self.processing_args[classifier]) p = classifier.predict(test_data["processedTweets"]) predictions.append(p) print("Collecting Votes") for i in range(len(predictions[0])): votes = collections.defaultdict(int) for j in range(len(predictions)): votes[predictions[j][i]] += 1 final_predictions.append(max(votes, key=votes.get)) return final_predictions