lda / it_analysis.r
it_analysis.r
Raw
# # # # # # # # # # # # # # # #
#         IT ANALYSIS         #
# # # # # # # # # # # # # # # #


# # PACKAGES USED

library(rstatix) 
library(dplyr) 
library(car)
library(lsr)
library(pastecs) 


# # DATA 

eptic_it <- read.csv("texts_with_sttr_it.csv")

summary(eptic_it)
#  direction          text_type              sttr       
# Length:69          Length:69          Min.   :0.4290  
# Class :character   Class :character   1st Qu.:0.4730  
# Mode  :character   Mode  :character   Median :0.4940  
#                                       Mean   :0.4897  
#                                       3rd Qu.:0.5030  
#                                       Max.   :0.5590


# # OUTLIERS

# Identifying outliers

outliers <- eptic_it %>%
  group_by(text_type) %>%
  identify_outliers(sttr) # https://www.rdocumentation.org/packages/rstatix/versions/0.7.2/topics/identify_outliers

# Removing outliers 

eptic_it <- eptic_it %>%
  anti_join(outliers, by = c("text_type", "direction", "sttr")) # https://search.r-project.org/CRAN/refmans/dplyr/html/filter-joins.html


# # DESCRIPTIVES

stats <- by(eptic_it$sttr, eptic_it$text_type, stat.desc) # https://www.rdocumentation.org/packages/pastecs/versions/1.3.21/topics/stat.desc
rounded_stats <- lapply(stats, round, 3)
rounded_stats 
# $it_sp_st
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       14.000        0.000        0.000        0.473        0.510        0.037        6.947        0.498        0.496        0.003        0.006        0.000        0.010        0.020 

# $it_sp_tt
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       17.000        0.000        0.000        0.429        0.508        0.079        7.805        0.465        0.459        0.006        0.012        0.001        0.023        0.050 

# $it_wr_st
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       17.000        0.000        0.000        0.463        0.538        0.075        8.488        0.503        0.499        0.005        0.010        0.000        0.019        0.038 

# $it_wr_tt
#      nbr.val     nbr.null       nbr.na          min          max        range          sum       median         mean      SE.mean CI.mean.0.95          var      std.dev     coef.var 
#       17.000        0.000        0.000        0.463        0.527        0.064        8.453        0.495        0.497        0.004        0.009        0.000        0.018        0.036 


# # NORMALITY TESTS

# Checking normality of the STTRs, by text_type 

tapply(eptic_it$sttr, eptic_it$text_type, shapiro.test)

# $it_sp_st
# data:  X[[i]]
# W = 0.9421, p-value = 0.4459

# $it_sp_tt
# data:  X[[i]]
# W = 0.92009, p-value = 0.1482

# $it_wr_st
# data:  X[[i]]
# W = 0.96484, p-value = 0.7235

# $it_wr_tt
# data:  X[[i]]
# W = 0.96958, p-value = 0.811


# # VARIANCES

leveneTest(eptic_it$sttr ~ eptic_it$text_type)

# Levene's Test for Homogeneity of Variance (center = median)
#       Df F value  Pr(>F)  
# group  3   2.511 0.06696 .
#       61                 
    

# # BOXPLOT

boxplot(eptic_it$sttr ~ eptic_it$text_type,
        main = "Boxplots of STTR by Text Type",
        xlab = "Text Type",
        ylab = "STTR",
        col = c(gray(0.9), gray(0.7), gray(0.5), gray(0.3)))
		 

# # ANOVA

anova_italian <- aov(eptic_it$sttr ~ eptic_it$text_type) # ANOVA assumptions are met so we can use aov()

summary(anova_italian)

#             Df  Sum Sq  Mean Sq F value   Pr(>F)    
# text_type    3 0.01873 0.006244   18.42 1.27e-08 ***
# Residuals   61 0.02068 0.000339                     
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


# # ANOVA EFFECT SIZE (OMEGA SQUARED) 

# The function is from here https://stats.stackexchange.com/questions/2962/omega-squared-for-measure-of-effect-in-r

omega_sq <- function(aovm){
    sum_stats <- summary(aovm)[[1]]
    SSm <- sum_stats[["Sum Sq"]][1]
    SSr <- sum_stats[["Sum Sq"]][2]
    DFm <- sum_stats[["Df"]][1]
    MSr <- sum_stats[["Mean Sq"]][2]
    W2 <- (SSm-DFm*MSr)/(SSm+SSr+MSr)
    return(W2)
}

omega_sq(anova_italian)
# 0.4456538


# # T-TESTS (UNCORRECTED)

t.test(eptic_it$sttr[eptic_it$text_type=="it_sp_tt"], 
		eptic_it$sttr[eptic_it$text_type=="it_wr_tt"],
		alternative = "two.sided",
		var.equal = TRUE)
# t = -5.3921, df = 32, p-value = 6.349e-06

t.test(eptic_it$sttr[eptic_it$text_type=="it_sp_st"], 
		eptic_it$sttr[eptic_it$text_type=="it_wr_st"],
		alternative = "two.sided",
		var.equal = TRUE)
# t = -0.54585, df = 29, p-value = 0.5893	
		
t.test(eptic_it$sttr[eptic_it$text_type=="it_sp_tt"], 
		eptic_it$sttr[eptic_it$text_type=="it_sp_st"],
		alternative = "two.sided",
		var.equal = TRUE)
# t = -5.6272, df = 29, p-value = 4.449e-06
		
t.test(eptic_it$sttr[eptic_it$text_type=="it_wr_tt"], 
		eptic_it$sttr[eptic_it$text_type=="it_wr_st"],
		alternative = "two.sided",
		var.equal = TRUE)
# t = -0.32368, df = 32, p-value = 0.7483


# # BONF. CORRECTIONS 

p.adjust(6.349e-06, method='bonferroni', n=4)
# 2.5396e-05

p.adjust(0.5893, method='bonferroni', n=4)
# 1

p.adjust(4.449e-06, method='bonferroni', n=4)
# 1.7796e-05 

p.adjust(0.7483, method='bonferroni', n=4)
# 1


# # T-TESTS EFFECT SIZE (COHEN'S D)

cohensD(eptic_it$sttr[eptic_it$text_type=="it_sp_tt"], 
		eptic_it$sttr[eptic_it$text_type=="it_wr_tt"])
# 1.849465

cohensD(eptic_it$sttr[eptic_it$text_type=="it_sp_st"], 
		eptic_it$sttr[eptic_it$text_type=="it_wr_st"])
# 0.1970006

cohensD(eptic_it$sttr[eptic_it$text_type=="it_sp_tt"], 
		eptic_it$sttr[eptic_it$text_type=="it_sp_st"])
# 2.030874

cohensD(eptic_it$sttr[eptic_it$text_type=="it_wr_tt"], 
		eptic_it$sttr[eptic_it$text_type=="it_wr_st"])
# 0.1110215