# lda / en_analysis.r
# # # # # # # # # # # # # # # #
#         EN ANALYSIS         #
# # # # # # # # # # # # # # # #


# # PACKAGES USED

library(rstatix)
library(dplyr)
library(car)
library(lsr)
library(pastecs)


# # DATA

# Read the STTR table; the path is relative to the current working directory.
eptic_en <- read.csv(file = "texts_with_sttr_en.csv")

# Column overview (direction, text_type, sttr) as a quick check of the import.
summary(object = eptic_en)
#  direction          text_type              sttr       
# Length:72          Length:72          Min.   :0.3820  
# Class :character   Class :character   1st Qu.:0.4146  
# Mode  :character   Mode  :character   Median :0.4274  
#                                       Mean   :0.4295  
#                                       3rd Qu.:0.4441  
#                                       Max.   :0.4839 


# # OUTLIERS

# Identifying outliers

# Outliers are flagged within each text_type group, not across the whole sample.
# https://www.rdocumentation.org/packages/rstatix/versions/0.7.2/topics/identify_outliers
outliers <- identify_outliers(group_by(eptic_en, text_type), sttr)

# Removing outliers: anti_join() keeps only the rows of eptic_en that have no
# match in `outliers` on all three key columns.
# https://search.r-project.org/CRAN/refmans/dplyr/html/filter-joins.html
eptic_en <- anti_join(
  eptic_en,
  outliers,
  by = c("text_type", "direction", "sttr")
)


# # DESCRIPTIVES

# Descriptive statistics of sttr for each text_type, computed by stat.desc().
# https://www.rdocumentation.org/packages/pastecs/versions/1.3.21/topics/stat.desc
stats <- by(
  data = eptic_en$sttr,
  INDICES = eptic_en$text_type,
  FUN = stat.desc
)

# Round every statistic to three decimals for reporting, then print the list.
rounded_stats <- lapply(stats, function(group_stats) round(group_stats, 3))
rounded_stats
# $en_sp_st
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       20.000        0.000        0.000        0.382        0.478        0.096        8.517        0.422        0.426        0.006        0.012        0.001        0.025        0.058 

# $en_sp_tt
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       15.000        0.000        0.000        0.387        0.428        0.041        6.151        0.408        0.410        0.003        0.007        0.000        0.013        0.031 

# $en_wr_st
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       18.000        0.000        0.000        0.403        0.477        0.074        7.867        0.435        0.437        0.004        0.009        0.000        0.019        0.043 

# $en_wr_tt
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       17.000        0.000        0.000        0.421        0.484        0.063        7.552        0.442        0.444        0.004        0.008        0.000        0.016        0.037 


# # NORMALITY TESTS

# Checking normality of the STTRs, by text_type 

# One Shapiro-Wilk test per text_type group; the result is printed per group.
with(eptic_en, tapply(X = sttr, INDEX = text_type, FUN = shapiro.test))

# $en_sp_st
# data:  X[[i]]
# W = 0.97619, p-value = 0.876

# $en_sp_tt
# data:  X[[i]]
# W = 0.95245, p-value = 0.5638

# $en_wr_st
# data:  X[[i]]
# W = 0.94034, p-value = 0.2937

# $en_wr_tt
# data:  X[[i]]
# W = 0.9447, p-value = 0.3782


# # VARIANCES

# Levene's test for homogeneity of variance of sttr across the text_type groups.
# text_type is stored as character (see the summary() output above), so it is
# converted to a factor explicitly instead of relying on coercion inside
# leveneTest(). Passing `data =` keeps the formula free of `eptic_en$` prefixes.
leveneTest(sttr ~ factor(text_type), data = eptic_en)

# Levene's Test for Homogeneity of Variance (center = median)
#       Df F value Pr(>F)
# group  3  2.0487 0.1155
#       66               
    

# # BOXPLOT

# One box per text_type group; fills run from light to dark grey in group order.
boxplot(
  sttr ~ text_type,
  data = eptic_en,
  main = "Boxplots of STTR by Text Type",
  xlab = "Text Type",
  ylab = "STTR",
  col = gray(c(0.9, 0.7, 0.5, 0.3))
)
		 

# # ANOVA

# One-way ANOVA of sttr on text_type. The Shapiro-Wilk and Levene results
# recorded above are all non-significant, so the ANOVA assumptions are treated
# as met and aov() is used.
# The variables are taken from `data =` rather than written as `eptic_en$...`,
# so the effect row of the summary is labelled `text_type`, as in the recorded
# output below.
anova_english <- aov(sttr ~ text_type, data = eptic_en)

summary(anova_english)

#             Df  Sum Sq  Mean Sq F value   Pr(>F)    
# text_type    3 0.01064 0.003547   9.584 2.45e-05 ***
# Residuals   66 0.02442 0.000370                     
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


# # ANOVA EFFECT SIZE (OMEGA SQUARED) 

# The function is from here https://stats.stackexchange.com/questions/2962/omega-squared-for-measure-of-effect-in-r

# Omega squared effect size for a one-way ANOVA.
#
# Reads the ANOVA table from summary(aovm) and computes
#   (SS_effect - df_effect * MS_residual) / (SS_effect + SS_residual + MS_residual)
#
# Args:
#   aovm: a fitted model whose summary() yields a two-row ANOVA table, with the
#         effect in row 1 and the residuals in row 2 (e.g. aov(y ~ group)).
#
# Returns:
#   A single numeric value: the omega squared estimate.
#
# Raises:
#   An error if the ANOVA table is not a two-row data frame.
omega_sq <- function(aovm) {
  anova_table <- summary(aovm)[[1]]

  # The formula below hard-codes "row 1 = effect, row 2 = residuals". A table
  # of any other shape (e.g. more than one term, or no predictor at all) would
  # silently give a meaningless number, so it is rejected up front.
  if (!is.data.frame(anova_table) || nrow(anova_table) != 2L) {
    stop(
      "omega_sq() expects a one-way ANOVA: one effect row plus residuals.",
      call. = FALSE
    )
  }

  sums_of_squares <- anova_table[["Sum Sq"]]
  ss_effect <- sums_of_squares[1]
  ss_resid <- sums_of_squares[2]
  df_effect <- anova_table[["Df"]][1]
  ms_resid <- anova_table[["Mean Sq"]][2]

  (ss_effect - df_effect * ms_resid) / (ss_effect + ss_resid + ms_resid)
}

# Omega squared for the text_type effect in the anova_english model.
omega_sq(anova_english)
# 0.2689463


# # T-TESTS (UNCORRECTED)

# Each comparison is a two-sided, pooled-variance (var.equal = TRUE) two-sample
# t-test on sttr between two text_type groups. Every result is stored as well
# as printed, so the correction step below can use the exact p-values instead
# of figures retyped by hand from the printed output.

tt_sp_tt_vs_wr_tt <- t.test(eptic_en$sttr[eptic_en$text_type == "en_sp_tt"],
                            eptic_en$sttr[eptic_en$text_type == "en_wr_tt"],
                            alternative = "two.sided",
                            var.equal = TRUE)
tt_sp_tt_vs_wr_tt
# t = -6.5267, df = 30, p-value = 3.231e-07

tt_sp_st_vs_wr_st <- t.test(eptic_en$sttr[eptic_en$text_type == "en_sp_st"],
                            eptic_en$sttr[eptic_en$text_type == "en_wr_st"],
                            alternative = "two.sided",
                            var.equal = TRUE)
tt_sp_st_vs_wr_st
# t = -1.5494, df = 36, p-value = 0.13

tt_sp_tt_vs_sp_st <- t.test(eptic_en$sttr[eptic_en$text_type == "en_sp_tt"],
                            eptic_en$sttr[eptic_en$text_type == "en_sp_st"],
                            alternative = "two.sided",
                            var.equal = TRUE)
tt_sp_tt_vs_sp_st
# t = -2.242, df = 33, p-value = 0.0318

tt_wr_tt_vs_wr_st <- t.test(eptic_en$sttr[eptic_en$text_type == "en_wr_tt"],
                            eptic_en$sttr[eptic_en$text_type == "en_wr_st"],
                            alternative = "two.sided",
                            var.equal = TRUE)
tt_wr_tt_vs_wr_st
# t = 1.194, df = 33, p-value = 0.241


# # BONF. CORRECTIONS

# The p-values are taken straight from the stored test objects, so they stay
# in step with the data. With no `n` given, p.adjust() corrects for the number
# of p-values supplied (four here).
# https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/p.adjust
uncorrected_p <- c(
  sp_tt_vs_wr_tt = tt_sp_tt_vs_wr_tt$p.value,
  sp_st_vs_wr_st = tt_sp_st_vs_wr_st$p.value,
  sp_tt_vs_sp_st = tt_sp_tt_vs_sp_st$p.value,
  wr_tt_vs_wr_st = tt_wr_tt_vs_wr_st$p.value
)

p.adjust(uncorrected_p, method = "bonferroni")
# Values previously recorded from the rounded p-values printed above:
# sp_tt_vs_wr_tt: 1.2924e-06
# sp_st_vs_wr_st: 0.52
# sp_tt_vs_sp_st: 0.1272
# wr_tt_vs_wr_st: 0.964


# # T-TESTS EFFECT SIZE (COHEN'S D)

# Cohen's d for the same four pairwise comparisons as the t-tests, in the same
# order. Columns are resolved inside eptic_en via with().

# en_sp_tt vs en_wr_tt
with(eptic_en, cohensD(sttr[text_type == "en_sp_tt"],
                       sttr[text_type == "en_wr_tt"]))
# 2.312066

# en_sp_st vs en_wr_st
with(eptic_en, cohensD(sttr[text_type == "en_sp_st"],
                       sttr[text_type == "en_wr_st"]))
# 0.5033751

# en_sp_tt vs en_sp_st
with(eptic_en, cohensD(sttr[text_type == "en_sp_tt"],
                       sttr[text_type == "en_sp_st"]))
# 0.7657748

# en_wr_tt vs en_wr_st
with(eptic_en, cohensD(sttr[text_type == "en_wr_tt"],
                       sttr[text_type == "en_wr_st"]))
# 0.4038092