# Twitter-Mar-Menor / Sentiment_analysis.py
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from langdetect import detect
from sentiment_analysis_spanish import sentiment_analysis

# Both analyzers are built once here and reused for every tweet further down.

# English: VADER (Hutto & Gilbert, May 2014), as shipped with NLTK.
# The VADER lexicon is fetched first so the analyzer can be instantiated.
# Tutorial: https://predictivehacks.com/how-to-run-sentiment-analysis-in-python-using-vader/
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Spanish: sentiment-analysis-spanish (Bello, H.)
# https://pypi.org/project/sentiment-analysis-spanish/
sentiment = sentiment_analysis.SentimentAnalysisSpanish()


# Indicator columns added to every tweet. At most one of them is 1 for a given
# tweet; all three stay 0 when the tweet could not be classified (language not
# Spanish/English, or language detection failed).
LABEL_COLUMNS = ['is_positive', 'is_negative', 'is_neutral']


def _detect_language(text):
    """Return the langdetect language code for *text*, or '' when detection fails."""
    # Non-string cells (e.g. empty Excel cells, which pandas reads as NaN)
    # cannot be language-detected; treat them as "unknown language".
    if not isinstance(text, str):
        return ''
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: an undetectable tweet is simply left
        # unclassified. `Exception` (rather than a bare `except:`) keeps
        # Ctrl+C / SystemExit able to stop this long-running script.
        return ''


def _classify_tweet(text):
    """Return the label column name that applies to *text*, or None.

    Spanish tweets are scored with the module-level Spanish analyzer
    (`sentiment`); English tweets with VADER (`sid`). Tweets in any other
    language, or whose language could not be detected, return None.
    """
    language = _detect_language(text)

    if language == 'es':
        # Score is compared against a neutral band of [0.4, 0.6].
        prob_positive = sentiment.sentiment(text)
        if prob_positive > 0.6:
            return 'is_positive'
        if prob_positive < 0.4:
            return 'is_negative'
        return 'is_neutral'

    if language == 'en':
        # VADER compound score: the sign decides the label, exactly 0 is neutral.
        compound = sid.polarity_scores(text)['compound']
        if compound > 0:
            return 'is_positive'
        if compound < 0:
            return 'is_negative'
        if compound == 0:
            return 'is_neutral'

    return None


# One input workbook per year ("Mar Menor <year> tweets.xlsx") produces one
# output workbook ("Mar Menor <year> sentiment.xlsx").
for year in range(2010,2023):
    iinput = f"Mar Menor {year} tweets.xlsx"
    data = pd.read_excel(iinput)

    # Classify every tweet once, then expand the label into 0/1 indicator columns.
    labels = data['Text'].map(_classify_tweet)
    for column in LABEL_COLUMNS:
        data[column] = (labels == column).astype('int64')

    # Number of positive / negative / neutral tweets per "Datetime" value.
    # Despite its name, this frame holds sums (counts of 1s), not means.
    # NOTE(review): the grouping key is the raw "Datetime" column, so rows are
    # per distinct timestamp unless the input is already at daily resolution
    # -- confirm against the data collection script.
    means_by_day = data[['Datetime'] + LABEL_COLUMNS].groupby(
        "Datetime", as_index=False).sum()

    # Total number of tweets (non-empty "Text" cells) per "Datetime" value.
    # Original author's note: the maximum is set in Twitter_data_collection.py
    # (100.000).
    tweets_by_day = data[['Datetime', 'Text']].groupby("Datetime", as_index=False).count()
    means_by_day['number of tweets'] = means_by_day.Datetime.map(
        tweets_by_day.set_index('Datetime')['Text'])

    # Calendar date of each row, kept alongside the original "Datetime" value.
    means_by_day['date'] = means_by_day['Datetime'].dt.date

    output = f"Mar Menor {year} sentiment.xlsx"

    means_by_day.to_excel(output, index=False)
    print(year)
    print("done")