Scripts/Script for AMOVA, Fst, Ho,He, Fis and DAPC.R · PopGenPaper

library(vcfR)
library(adegenet)
library(poppr)
library(mmod)
library(ape)
library(magrittr)
library(ggplot2)

setwd("/home/sergio/OneDrive/DocumentosData 1/Proyectos/Proyecto Salvador")
#vcf <- read.vcfR("batch_1NombreCorto (1).vcf", verbose = FALSE )
#vcf <- read.vcfR("batch_1NombreCorto_(1)_filtered (1).vcf", verbose = FALSE)
vcf <- read.vcfR("batch_1NombreCorto_(1)_filteredMAF0.01.vcf", verbose = FALSE)
vcfsamplenames <- colnames(vcf@gt)
#the object SNPSmsSnpInd has the list of selected individuals with missingness less than 0.2
#taken from the genetic filtering analysis using "rownames(SNPSmsSnpInd)":
samples50 <- c("A9944", "CHJ0007_L", "CHJ011_A", "CHJ083_A", "CHJ155_A", "CHJ188_A", "S174_A", "S236_A", 
               "S327_A", "S328_A", "S333_A", "S336_A", "S358_A", "S359_A", "S443_A", "S445_A", 
               "S446_A", "S447_A", "S448_A", "S451_A", "S455_A", "S476_A", "S478_A", "S486_A", 
               "S507a_A", "S507b_A", "S649_A", "TEX0010", "TEX0023", "TPG1017_A", "TPG154", "TPG715", 
               "TPS0053", "TPS0068", "TPS0180", "TPS0225", "TPS0330", "CHJ1016_L", "CHJ168_A", "CHJ169_A", 
               "CHJ172_A", "CHJ360_A", "CHJ362_A", "CHJ363_A", "CHJ364_A", "CHJ366_A", "CHJ367_A", "CHJ376_A", 
               "CHJ378_A", "CHJ386_A")
#these are the excluded samples:
setdiff(vcfsamplenames, samples50)



#vcf2 has here extraction of the individuals that are listed in the samples of SNPSmsSnpInd, and preserving
#the first column named "format"

vcf2 <- vcf[,c(1,match(samples50,vcfsamplenames))]
#write.vcf(vcf2, file = "batch_1NombreCorto_(1)_filteredMAF0.01ind50.vcf.gz")
#rownames(SNPSmsSnpInd)
#colnames(vcf@gt)[-1]
strata <-as.data.frame(read.csv2("GeneindStrata.csv",sep = ","))


strata2 <- strata[match(colnames(vcf2@gt)[-1],strata$Sample),]

#to correct the error I needed to change the vcf ID column to something unique https://www.biostars.org/p/305137/
#this was taken from filtering script: y$ID <- paste(y$X.CHROM,"_",y$POS,"_",y$ID,sep="")
vcf2@fix[,"ID"] = paste0(vcf@fix[,"CHROM"],"_",vcf@fix[,"POS"],vcf@fix[,"ID"])
#I used this before, and also works vcf2@fix[,"ID"] = paste0(vcf@fix[,"CHROM"],"_",vcf@fix[,"POS"],"_",1:2)
#head(vcf)
#vcf@fix[,"ID"]

my_genind <- vcfR2genind(vcf2)

# to see if there are duplicates:https://stackoverflow.com/questions/16905425/find-duplicate-values-in-r
n_occur = data.frame(table(vcf2@fix[,"ID"]))
n_occur[n_occur$Freq > 1,]
#Here the ID variable have only unique values.  Error corrected.

my_genind <- vcfR2genind(vcf2)
#names(as.data.frame(vcf@gt)) #to get sample names, but better use:
indNames(my_genind)
#to extract individuals see: https://lists.r-forge.r-project.org/pipermail/adegenet-forum/2015-December/001388.html
#I got sample names as an object and the exported as csv, I got the GeneindStrata.csv from Google Docs and
#exported "GeneindStrata.csv".  Then I imported it.

#strata(my_genind) = as.data.frame(strata$VillageDepartmentCountry)
#splitStrata(my_genind, ~Country/Department/Village, sep = "_")

strata(my_genind) = data.frame(Country = strata2$Country,Department = strata2$DepartmentSA,Village =strata2$Village.1)

setPop(my_genind) <- ~Country/Department
#from https://github.com/mossmatters/Structure-Pipeline/blob/master/structureplot.R
#poppr(my_genind)
#To save my_genind: save(my_genind,file="genindfiltered50ind.robj"), taken from: https://lists.r-forge.r-project.org/pipermail/adegenet-forum/2015-August/001227.html
#to open the object: load("genindfiltered50ind.robj")
#https://grunwaldlab.github.io/Population_Genetics_in_R/Pop_Structure.html
#my_genind_dist <- provesti.dist(my_genind)
#par(mar = c(1,1,1,1),xpd=TRUE)
# theTree <- my_genind_dist %>%
#   nj() %>%    # calculate neighbor-joining tree
#   ladderize() # organize branches by clade
# plot(theTree)
# add.scale.bar(length = 0.05) # add a scale bar showing 5% difference.

# Analysis from https://grunwaldlab.github.io/Population_Genetics_in_R/Pop_Structure.html
par(mar = c(1,1,3,1),xpd=TRUE)
set.seed(999)
my_genind %>%
  genind2genpop(pop = ~Country/Department) %>%
  aboot(cutoff = 50, quiet = TRUE, sample = 1000, distance = nei.dist)


my_genind %>%
  genind2genpop(pop = ~Country/Department/Village) %>%
  aboot(cutoff = 50, quiet = TRUE, sample = 1000, distance = nei.dist)

#AMOVA 
My_Genind_amova <- poppr.amova(my_genind, ~Country/Department)
My_Genind_amova
My_Genind_amova$results
My_Genind_amova$componentsofcovariance
#from https://grunwaldlab.github.io/Population_Genetics_in_R/AMOVA.html:
My_Genind_amova_test    <- randtest(My_Genind_amova, nrepet = 999)
My_Genind_amova_test

#https://rdrr.io/cran/hierfstat/man/basic.stats.html
#calculate Fst between populations
library("hierfstat")
mybasicstats = basic.stats(my_genind)
mybasicstats$overall #to see the global Ho, Hs, HT, Dst, etc.
t(summary(mybasicstats$Ho)[4,])
t(summary(mybasicstats$Hs)[4,])
t(summary(mybasicstats$Fis)[4,])

#Mann Whitney U for Ho != Hs
for (i in 1:4) {
print(colnames(mybasicstats$Ho)[i]);
print(wilcox.test(mybasicstats$Ho[,i], mybasicstats$Hs[,i]))
}

#from: https://statsandr.com/blog/one-sample-wilcoxon-test-in-r/
for (i in 1:4) {
  print(colnames(mybasicstats$Fis)[i]);
  print(wilcox.test(mybasicstats$Fis[,i]
                    ))
}


#https://www.geeksforgeeks.org/how-to-find-confidence-intervals-in-r/
# R program to find the confidence interval
# Calculate the mean and standard error

model <- lm(mybasicstats$Ho ~ 1)
# Find the confidence interval of Ho
confint(model, level=0.95)

model <- lm(mybasicstats$Hs ~ 1)
# Find the confidence interval of Hs
confint(model, level=0.95)

model <- lm(mybasicstats$Fis ~ 1)
# Find the confidence interval of Fis
confint(model, level=0.95)
model

#calculate number of non NaN elements in Fis column https://stackoverflow.com/questions/24027605/determine-the-number-of-na-values-in-a-column
length(mybasicstats$Fis) - sum(is.na(mybasicstats$Fis))
length(mybasicstats$Ho) - sum(is.na(mybasicstats$Ho))
length(mybasicstats$Hs) - sum(is.na(mybasicstats$Hs))

par(mar=c(2, 2, 2, 2))
Ho = as.data.frame(mybasicstats$Ho)
par(mfrow = c(2, 2))
for(i in 1:length(Ho)) {
  hist(as.numeric(unlist(Ho[i]) ) ,main=paste0("Histogram of Ho of ",names(Ho)[i]),
       ylim = c(0, 800))
}
Hs = as.data.frame(mybasicstats$Hs)
for(i in 1:length(Hs)) {
  hist(as.numeric(unlist(Hs[i]) ) ,main=paste0("Histogram of Hs of ",names(Hs)[i]),
       ylim = c(0, 800))
}

#The overlapping Ho and He in the same plots.
#https://www.dataanalytics.org.uk/plot-two-overlapping-histograms-on-one-chart-in-r/#:~:text=To%20make%20sure%20that%20both,combined%20range%20of%20the%20samples.
ax <- pretty(0:1,n=12)
c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink")
#reorder columns:
Ho <- Ho[,c(4,1,2,3)]
Hs <- Hs[,c(4,1,2,3)]
departmentsforgraph <- gsub("_"," ",colnames(Ho))
departmentsforgraph <- gsub("lS","l S",departmentsforgraph,ignore.case = FALSE)
departmentsforgraph <- gsub("eA","e-A",departmentsforgraph,ignore.case = FALSE)
departmentsforgraph <- gsub("Salvador","Salvador,",departmentsforgraph,ignore.case = FALSE)
departmentsforgraph <- gsub("Guatemala","Guatemala,",departmentsforgraph,ignore.case = FALSE)

#Histograms of Ho and Hs
par(mfrow = c(1, 1))
# #for common legend: https://stackoverflow.com/questions/10389967/common-legend-for-multiple-plots-in-r
# m <- matrix(c(1,2,3,1,4,5,1,6,6),nrow = 3,ncol = 3,byrow = TRUE)
# layout(mat = m,heights = c(0.4,0.4,0.2),widths = c(0.1,0.45,0.45))
# plot(1, type = "n", axes=N, xlab="", ylab="SNPs number", bty = 'n')
# text(x=1,y=1.1,"Number of SNPs",srt=90,cex=2)
# #legend(x="left", legend="SNPs number",cex=2, bty = "n",seg.len=1, xpd = TRUE, horiz = F)
# 
# for(i in 1:length(Ho)) {
#   histHo <- hist(as.numeric(unlist(Ho[i])), breaks = ax, plot = FALSE) #first histogram
#   histHs <- hist(as.numeric(unlist(Hs[i])), breaks = ax, plot = FALSE) #second histogram
#     plot(histHo, col = "yellow",ylim = c(0,800), main = paste0("\n\n\n",departmentsforgraph[i]),xlab="", ylab="") #https://stackoverflow.com/questions/33480500/is-there-a-way-to-show-overlapping-histograms-in-r-without-adjusting-transparenc
#   plot(histHs, col = "red",ylim = c(0,800), add = TRUE, xlab="", ylab="")
#   plot(histHo, col = "yellow",ylim = c(0,800), add = TRUE, xlab="", ylab="")  
#   plot(histHs, col = rgb(1,0,0,0.5),ylim = c(0,800), add = TRUE, xlab="", ylab="")
# 
# }
# 
# plot(1, type = "n", axes=FALSE, xlab="", ylab="", bty = 'n')
# plot_colors <- c("yellow","red")
# legend(x="top", y=NULL, legend = c("Ho","Hs"), fill = c("yellow","red"), 
#        cex = 1.8,horiz = TRUE,inset = -0,bty = "n" )
# legend(x="bottom", legend="Heterozygocities",cex=2, bty = "n",seg.len=1, xpd = TRUE)

#for common legend: https://stackoverflow.com/questions/10389967/common-legend-for-multiple-plots-in-r
m <- matrix(c(1,2,3,1,4,5,1,6,6),nrow = 3,ncol = 3,byrow = TRUE)
layout(mat = m,heights = c(0.4,0.4,0.2),widths = c(0.1,0.45,0.45))
plot(1, type = "n", axes=FALSE, xlab="", ylab="SNPs number", bty = 'n')
text(x=1,y=1.1,"Number of SNPs",srt=90,cex=2)
#legend(x="left", legend="SNPs number",cex=2, bty = "n",seg.len=1, xpd = TRUE, horiz = F)

for(i in 1:length(Ho)) {
  histHo <- hist(as.numeric(unlist(Ho[i])), breaks = ax, plot = FALSE) #first histogram
  histHs <- hist(as.numeric(unlist(Hs[i])), breaks = ax, plot = FALSE) #second histogram
  plot(histHo, col = "yellow",ylim = c(0,800), main = paste0("\n",departmentsforgraph[i]),
       xlab="", ylab="",cex.main=1.5) #https://stackoverflow.com/questions/33480500/is-there-a-way-to-show-overlapping-histograms-in-r-without-adjusting-transparenc
  plot(histHs, col = "red",ylim = c(0,800), add = TRUE, xlab="", ylab="")
  plot(histHo, col = "yellow",ylim = c(0,800), add = TRUE, xlab="", ylab="")  
  plot(histHs, col = rgb(1,0,0,0.5),ylim = c(0,800), add = TRUE, xlab="", ylab="")
  # if(i==2) {
  #   legend(x="topright",legend=c("Ho","Hs"),fill=c("yellow","red"),cex=1.8)
  # }
  
}

plot(1, type = "n", axes=FALSE, xlab="", ylab="", bty = 'n')
plot_colors <- c("yellow","red")
legend(x="top", y=NULL, legend = c("Ho","Hs"), fill = c("yellow","red"), 
       cex = 1.8,horiz = TRUE,inset = -0.5,bty = "n" )
legend(x="top", legend="Heterozygocities",cex=2, bty = "n",seg.len=1, xpd = TRUE)












# #for common legend: https://stackoverflow.com/questions/10389967/common-legend-for-multiple-plots-in-r
# m <- matrix(c(1,2,3,1,4,5,1,6,6),nrow = 3,ncol = 3,byrow = TRUE)
# layout(mat = m,heights = c(0.4,0.4,0.2),widths = c(0.1,0.45,0.45))
# plot(1, type = "n", axes=N, xlab="", ylab="SNPs number", bty = 'n')
# text(x=1,y=1.1,"Number of SNPs",srt=90,cex=2)
# #legend(x="left", legend="SNPs number",cex=2, bty = "n",seg.len=1, xpd = TRUE, horiz = F)
# 
# for(i in 1:length(Ho)) {
#   histHo <- hist(as.numeric(unlist(Ho[i])), breaks = ax, plot = FALSE) #first histogram
#   histHs <- hist(as.numeric(unlist(Hs[i])), breaks = ax, plot = FALSE) #second histogram
#   plot(histHo, col = "yellow",ylim = c(0,800), main = paste0("\n\n\n",departmentsforgraph[i]),xlab="", ylab="") #https://stackoverflow.com/questions/33480500/is-there-a-way-to-show-overlapping-histograms-in-r-without-adjusting-transparenc
#   plot(histHs, col = "red",ylim = c(0,800), add = TRUE, xlab="", ylab="")
#   plot(histHo, col = "yellow",ylim = c(0,800), add = TRUE, xlab="", ylab="")  
#   plot(histHs, col = rgb(1,0,0,0.5),ylim = c(0,800), add = TRUE, xlab="", ylab="")
#   if(i==2) {
#     legend(x="topright",legend=c("Ho","Hs"),fill=c("yellow","red"),cex=1.8)
#   }
#   
# }
# 
# plot(1, type = "n", axes=FALSE, xlab="", ylab="", bty = 'n')
# #plot_colors <- c("yellow","red")
# #legend(x="top", y=NULL, legend = c("Ho","Hs"), fill = c("yellow","red"), 
# #       cex = 1.8,horiz = TRUE,inset = -0,bty = "n" )
# legend(x="top", legend="Heterozygocities",cex=2, bty = "n",seg.len=1, xpd = TRUE)



par(mfrow = c(1, 1))
FstDepart  <- pairwise.neifst(my_genind)
plot(hclust(as.dist(FstDepart)), main="Fst dendrogram between departments")
# to get confidence intervals for pairwise Fst:
boot.ppfst(my_genind)  #https://rdrr.io/cran/hierfstat/man/boot.ppfst.html


setPop(my_genind) <- ~Country/Department/Village
FstVillage  <- pairwise.neifst(my_genind) #write.csv(FstVillage,"FstVillage.csv")
#plot(hclust(as.dist(FstVillage))) #now it gives an error which is corrected below
#erase columns and rows with NAs https://stackoverflow.com/questions/54153670/remove-na-values-from-distance-matrix-in-r
FstVillageNA = FstVillage[rowSums(is.na(FstVillage)) == 1, colSums(is.na(FstVillage)) == 1, drop = FALSE]
plot(hclust(as.dist(FstVillageNA)), main="Fst dendrogram between villages")
#write.csv(FstVillageNA,"FstVillageNA.csv")

#there was an error since there are some NaNs in the matrix 
#https://github.com/jokergoo/ComplexHeatmap/issues/155
#I found this function: 
#' Hclust cannot handle matrices in which for some pairs of rows and columns,
#' only 1 or fewer shared values are non-NA. This function recurrently
#' identifies the most aggravating column/row, excludes that column/row and checks
#' whether more columns/rows need to be excluded
#'
#' @param mat Matrix to investigate
#' @param min_shared_fields Minimum number of positions that are not NA in both
#' vectors in order not to flag the vector pair as problematic
#'
identify_problematic_combs <- function(mat, min_shared_fields = 1) {
  exclude_rows <- NULL
  exclude_cols <- NULL
  stopifnot(is.matrix(mat))
  
  ## Loop over candidate removals
  for (k in 1:nrow(mat)) {
    candidate_rows <- setdiff(1:nrow(mat), exclude_rows)
    problem_row_combs <- NULL
    for (i in candidate_rows) {
      i_idx <- which(candidate_rows == i)
      for (j in candidate_rows[i_idx:length(candidate_rows)]) {
        if (sum(!is.na(mat[i, ]) & !is.na(mat[j, ])) <= min_shared_fields) {
          problem_row_combs <- rbind(problem_row_combs, c(i, j))
        }
      }
    }
    if (is.null(problem_row_combs)) break
    exclude_rows <- c(exclude_rows,
                      as.integer(names(which.max(table(problem_row_combs)))))
  }
  
  for (k in 1:ncol(mat)) {
    candidate_cols <- setdiff(1:ncol(mat), exclude_cols)
    problem_col_combs <- NULL
    for (i in candidate_cols) {
      i_idx <- which(candidate_cols == i)
      for (j in candidate_cols[i_idx:length(candidate_cols)]) {
        if (sum(!is.na(mat[, i]) & !is.na(mat[, j])) <= min_shared_fields) {
          problem_col_combs <- rbind(problem_col_combs, c(i, j))
        }
      }
    }
    if (is.null(problem_col_combs)) break
    exclude_cols <- c(exclude_cols,
                      as.integer(names(which.max(table(problem_col_combs)))))
  }
  
  return(list('row' = exclude_rows, 'column' = exclude_cols))
}


remove_problematic_combs <- function() {
  problematic_combs <- identify_problematic_combs(
    mat = mat, min_shared_fields = min_shared_fields)
  if (!is.null(problematic_combs$row)) {
    mat <- mat[-problematic_combs$row, ]
  }
  if (!is.null(problematic_combs$column)) {
    mat <- mat[, -problematic_combs$column]
  }
  return(mat)
}
formals(remove_problematic_combs) <- formals(identify_problematic_combs)
#use the functions to eliminate the problematic rows(columns)
identify_problematic_combs(FstVillage,10)
par(mar = c(3,3,3,3),xpd=TRUE)
FstVillage <- remove_problematic_combs(FstVillage,10)

#plot without three problematic villages
plot(hclust(as.dist(FstVillage)))

# #https://popgen.nescent.org/DifferentiationSNP.html
# #unsupervised clustering
# # using Kmeans and DAPC in adegenet 
# set.seed(20160308) # Setting a seed for a consistent result
# grp <- find.clusters(my_genind, max.n.clust = 10, n.pca = 20, choose.n.clust = FALSE) 
# names(grp)
# grp$grp
# par(mar=c(0,0,0,0))
# dapc1 <- dapc(my_genind, grp$grp, n.pca = 20, n.da = 6) 
# scatter(dapc1) # plot of the group

#https://grunwaldlab.github.io/Population_Genetics_in_R/DAPC.html
#DAPC using the Departments
#dapc <- dapc(my_genind, var.contrib = TRUE, scale = FALSE, n.pca = 30, n.da = nPop(my_genind) - 1)
#scatter(dapc, cell = 0, pch = 18:23, cstar = 0, mstree = TRUE, lwd = 2, lty = 2, posi.da = "topright")

setPop(my_genind) <- ~Country/Department
dapc <- dapc(my_genind, var.contrib = TRUE, scale = FALSE, n.pca = 30, n.da = nPop(my_genind) - 1)
scatter(dapc, cell = 0, pch = 18:23, cstar = 0, mstree = TRUE, lwd = 2, lty = 2, posi.da = "topright")

#from https://popgen.nescent.org/startMicrosatellite.html
div = summary(my_genind)
plot(div$Hobs, xlab="Loci number", ylab="Observed Heterozygosity", 
     main="Observed heterozygosity per locus")
plot(div$Hobs, div$Hexp, xlab="Observed Heterozygosity", ylab="Expected Heterozygosity", 
     main="Expected heterozygosity as a function of observed heterozygosity per locus")

basicstat <- basic.stats(my_genind,diploid=TRUE,digits=4)
names(basicstat)
Fis = summary(basicstat$Fis)
Fis = as.data.frame(Fis)
names(Fis) = c("No", "Dept", "Fis")
Fis
Fis = Fis[grep("Mean",Fis$Fis),]
Fis$Fis = gsub("Mean   : ", "", Fis$Fis)

#to get the mean and the sd in two tables:
summaryHoHeFismean = matrix(, nrow = 4, ncol = 3)
summaryHoHeFisSD = matrix(, nrow = 4, ncol = 3)
for (j in 3:5) {
  for (i in 1:4) {
    print(paste(colnames(basicstat[[j]])[i],names(basicstat)[j]))
    print(paste("mean = ",mean(basicstat[[j]][,i],na.rm=T)))
    print(paste("sd = ",sd(basicstat[[j]][,i],na.rm=T)))
    summaryHoHeFismean[i,j-2]=mean(basicstat[[j]][,i],na.rm=T)
    summaryHoHeFisSD[i,j-2]=sd(basicstat[[j]][,i],na.rm=T)
  }}
colnames(summaryHoHeFismean) = names(basicstat[3:5])
rownames(summaryHoHeFismean) = colnames(basicstat[[3]])
summaryHoHeFismean =summaryHoHeFismean[c(4,1,3,2),]

colnames(summaryHoHeFisSD) = names(basicstat[3:5])
rownames(summaryHoHeFisSD) = colnames(basicstat[[3]])
summaryHoHeFisSD =summaryHoHeFisSD[c(4,1,3,2),]
summaryHoHeFismean = round(summaryHoHeFismean,4)
summaryHoHeFisSD = round(summaryHoHeFisSD,4)
summaryHoHeFismean
summaryHoHeFisSD


#DAPC taken from the tutorial chrome-extension://oemmndcbldboiebfnladdacbdfmadadm/https://adegenet.r-forge.r-project.org/files/tutorial-dapc.pdf
setPop(my_genind) <- ~Country/Department
grp <- find.clusters(my_genind, max.n.clust=40)
# I chose number of groups as 2 and 4.
table(pop(my_genind), grp$grp)
#table.value(table(pop(my_genind),grp$grp), col.lab=paste("inf", 1:4),row.lab=paste("ori", 1:4))
table.value(table(pop(my_genind),grp$grp), col.lab=paste("inf", 1:4),row.lab=unique(pop(my_genind)))

#erase strata data from genind object and add the pop$pop as the population
strata(my_genind) = NULL
pop(my_genind) = grp$grp


#To see number of PC that should be retained:
par(mar = c(5,5,1,1),xpd=TRUE)


dapc1 <- dapc(my_genind, grp = grp$grp,n.pca = 10, n.da = length(unique(grp$grp)-1))
#nevertheless, there is not a clear limit under 62.
#it is easier to use about 10 pcs retained and 4 DF retained.
#scatter(dapc1, posi.da = "topright")
#scatter(dapc1,1,1, col=myCol, bg="white",
#        scree.da=FALSE, legend=TRUE, solid=.4)
scatter(dapc1, posi.da="bottomright", bg="white", pch=17:20, col = c("blue", "red", "darkgreen", "black"),
        cex=1,clab=0, leg=TRUE,txt.leg=paste("Cluster",1:4), cell=0, cstar=0)

#scatter(dapc(my_genind, var.contrib = TRUE, scale = FALSE, n.pca = 30, n.da = length(unique(grp$grp)-1)),
#        posi.da = "topright", grp = grp$grp)

#For different number of groups, in this case 10 groups:
#reset strata and pop data for genind object
strata(my_genind) = data.frame(Country = strata$Country,Department = strata$DepartmentSA,Village =strata$Village.1)

setPop(my_genind) <- ~Country/Department
#obtain 10 groups
grp <- find.clusters(my_genind, max.n.clust=40,n.pca = 62, n.clust = 10)

table(pop(my_genind), grp$grp)

table.value(table(pop(my_genind),grp$grp), col.lab=paste("inf", 1:10),row.lab=unique(pop(my_genind)))

#erase strata data from genind object and add the pop$pop as the population
strata(my_genind) = NULL
pop(my_genind) = grp$grp


#To see number of PC that should be retained:
par(mar = c(5,5,1,1),xpd=TRUE)

dapc2 <- dapc(my_genind, grp = grp$grp,n.pca = 10, n.da = (length(unique(grp$grp))-1))
scatter(dapc2, posi.da="bottomright", bg="white", pch =15:20 , col = c("blue", "red", "darkgreen", "black","pink","orange",
                                                                             "magenta","brown","brown3","purple"),
                                                                             cex=1,clab=0, leg=TRUE,txt.leg=paste("Cluster",unique(grp$grp)), cell=0, cstar=0)

scatter(dapc2, posi.da="bottomright", bg="white", pch =15:20 , col = c("blue", "red", "darkgreen", "black","pink","orange",
                                                                             "magenta","brown","brown3","purple"),
                                                                             cex=1,clab=0.5, leg=TRUE,txt.leg=paste("Cluster",unique(grp$grp)), cell=0, cstar=0)

#groups from 2 to 10

set.seed(100)

for (x in 3:10) {
  set.seed(100)
  grp <- find.clusters(my_genind, max.n.clust=40,n.pca = dim(dataDAPC)[1], n.clust = x)
  strata(my_genind) = data.frame(Country = strata2$Country,Department = strata2$DepartmentSA,Village =strata2$Village.1)
  setPop(my_genind) <- ~Country/Department
  table(pop(my_genind), grp$grp)
  
  table.value(table(pop(my_genind),grp$grp), col.lab=paste("inf", 1:10),row.lab=unique(pop(my_genind)))
  
  #erase strata data from genind object and add the pop$pop as the population
  strata(my_genind) = NULL
  pop(my_genind) = grp$grp
  
  
  par(mar = c(5,5,1,1),xpd=TRUE)
  
  dapc2 <- dapc(my_genind, grp = grp$grp,n.pca = 10, n.da = (length(unique(grp$grp))-1))
  
  # scatter(dapc2, posi.da="bottomright", bg="white", pch =15:20 , col = c("blue", "red", "darkgreen", "black","pink","orange",
  #                                                                        "magenta","brown","brown3","purple"),
  #         cex=1,clab=0, leg=TRUE,txt.leg=paste("Cluster",unique(grp$grp)), cell=0, cstar=0)
  # 
  scatter(dapc2, posi.da="bottomleft", bg="white", pch =15:20 , col = c("blue", "red", "darkgreen", "black","pink","orange",
                                                                              "magenta","brown","brown3","purple"),
                                                                              cex=1,clab=0.5, leg=TRUE,txt.leg=paste("Cluster",unique(grp$grp)), cell=0, cstar=0)
  # scatter(dapc2, posi.da="bottomright", bg="white", pch =15:20 , col = c("blue", "red", "darkgreen", "black","pink","orange",
  #                                                                       "magenta","brown","brown3","purple"),
  #         cex=1,clab=0.5, leg=TRUE,txt.leg=paste("Cluster",unique(grp$grp)), cell=0, cstar=0, xlim = c(-0.1,0.1),ylim = c(-5,5))
}
#######################################33333
x=3
set.seed(100)
grp <- find.clusters(my_genind, max.n.clust=40,n.pca = dim(dataDAPC)[1], n.clust = x)
strata(my_genind) = data.frame(Country = strata2$Country,Department = strata2$DepartmentSA,Village =strata2$Village.1)
setPop(my_genind) <- ~Country/Department
table(pop(my_genind), grp$grp)

table.value(table(pop(my_genind),grp$grp), col.lab=paste("inf", 1:10),row.lab=unique(pop(my_genind)))

#erase strata data from genind object and add the pop$pop as the population
strata(my_genind) = NULL
pop(my_genind) = grp$grp


par(mar = c(5,5,1,1),xpd=TRUE)

dapc2 <- dapc(my_genind, grp = grp$grp,n.pca = 10, n.da = (length(unique(grp$grp))-1))
############################*****************


dataDAPC <- dapc2$ind.coord
dataDAPC <- as.data.frame(dataDAPC)
dataDAPC$Dept <- as.factor(strata2$DepartmentSA)
#dataDAPC$DeptCol <- colors$DepartmentColor
dataDAPC$DeptCol <- rainbow(dim(dataDAPC)[1])
#plot(dataDAPC$LD1, dataDAPC$LD2, col=dataDAPC$DeptColor, pch=16)
# https://www.geeksforgeeks.org/how-to-color-scatter-plot-points-in-r/
#in a for loop, ggplot needs "print()": https://stackoverflow.com/questions/15678261/ggplot-does-not-work-if-it-is-inside-a-for-loop-although-it-works-outside-of-it
print(ggplot(dataDAPC, aes(x=LD1, y=LD2,group=Dept))+ 
        geom_point(aes(color=Dept)) + ggtitle(paste0("K = ",x)))# +xlim(-25,75)+ylim(-25,75)

#################################################33
#This is the DAPC graph with asterix pointing to El Carrizal
dataDAPC <- dapc2$ind.coord
dataDAPC <- as.data.frame(dataDAPC)
dataDAPC$Village <- as.factor(strata2$Village )
dataDAPC$Dept <- strata2$Department
dataDAPC$Country <- strata2$Country
dataDAPC$Dept[dataDAPC$Dept== "Santa_Ana"] <- "Santa Ana"
dataDAPC$DeptCol <-   ifelse(dataDAPC$Dept == "Jutiapa", "blue2",
                                                ifelse(dataDAPC$Dept == "Chiquimula", "dodgerblue3",
                                                ifelse(dataDAPC$Dept == "Sonsonate", "magenta",
                                                ifelse(dataDAPC$Dept == "Ahuachapan","orange4",
                                                ifelse(dataDAPC$Dept == "Santa Ana","deeppink3",NA)))))

dataDAPC$VillageIDs <- ifelse(dataDAPC$Village == "Santa_Ana", "",
                      ifelse(dataDAPC$Village == "El Zacatal, Amatepec", "EZa",
                      ifelse(dataDAPC$Village == "El Jute", "EJu",
                      ifelse(dataDAPC$Village == "El Aguacate, Azacualpa, Municipio de Armenia", "Aza",
                      ifelse(dataDAPC$Village == "Chilguyo, Texistepeque", "Chi",
                      ifelse(dataDAPC$Village == "Caserio Sarita Loma, Aldea La Primavera", "LPi",
                      ifelse(dataDAPC$Village == "Caserio San Raymundo, Canton Llano", "LDM",
                      ifelse(dataDAPC$Village == "Cantón Las Flores Cerro Alto, Municipio Caluco ", "CAl",
                      ifelse(dataDAPC$Village == "Llano Santa María", "LSM",
                      ifelse(dataDAPC$Village == "El_Chaperno", "ECh",
                      ifelse(dataDAPC$Village == "El Cerron, Olopa","ECe",
                      ifelse(dataDAPC$Village == "La Prensa, Olopa","LPr",
                      ifelse(dataDAPC$Village == "El_Carrizal","ECa",""
                      )))))))))))))
dataDAPC$group <- c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "A", "B", "A", "A", "B", "A", "B", "B", "A", "B", "B", "B", "B", "B", "B", "B", "B", "A", "A", "C", "C", "C", "A", "B", "A", "A", "A", "C", "A", "A", "A", "C", "C", "C", "C", "C", "C", "C", "C", "C")


dataDAPC$VillageCol = ifelse(dataDAPC$Village == "Santa Ana", "gray62",
                      ifelse(dataDAPC$Village == "El Zacatal, Amatepec", "hotpink",
                      ifelse(dataDAPC$Village == "El Jute", "orange3",
                      ifelse(dataDAPC$Village == "El Zacatal, Amatepec", "orange4",
                      ifelse(dataDAPC$Village == "El Jute", "magenta",
                      ifelse(dataDAPC$Village == "El Aguacate, Azacualpa, Municipio de Armenia", "red",
                      ifelse(dataDAPC$Village == "Chilguyo, Texistepeque", "coral1",
                      ifelse(dataDAPC$Village == "Caserio Sarita Loma, Aldea La Primavera", "deeppink",
                      ifelse(dataDAPC$Village == "Caserio San Raymundo, Canton Llano", "coral3",
                      ifelse(dataDAPC$Village == "Cantón Las Flores Cerro Alto, Municipio Caluco ", "coral4",
                      ifelse(dataDAPC$Village == "Llano Santa María", "cyan3",
                      ifelse(dataDAPC$Village == "El_Chaperno", "green",
                      ifelse(dataDAPC$Village == "El Cerron, Olopa","blue",
                      ifelse(dataDAPC$Village == "La Prensa, Olopa","blueviolet",
                      ifelse(dataDAPC$Village == "El_Carrizal","dodgerblue",
                      "gray62"
                      )))))))))))))))
dataDAPC$VillageShort <- ifelse(dataDAPC$Village == "Santa Ana", "",
                        ifelse(dataDAPC$Village == "El Zacatal, Amatepec", "El Zacatal",
                        ifelse(dataDAPC$Village == "El Jute", "El Jute",
                        ifelse(dataDAPC$Village == "El Aguacate, Azacualpa, Municipio de Armenia", "Azacualpa",
                        ifelse(dataDAPC$Village == "Chilguyo, Texistepeque", "Chilcuyo",
                        ifelse(dataDAPC$Village == "Caserio Sarita Loma, Aldea La Primavera", "La Primavera",
                        ifelse(dataDAPC$Village == "Caserio San Raymundo, Canton Llano", "Llano de Dona María",
                        ifelse(dataDAPC$Village == "Cantón Las Flores Cerro Alto, Municipio Caluco ", "Cerro Alto",
                        ifelse(dataDAPC$Village == "Llano Santa María", "Llano Santa María",
                        ifelse(dataDAPC$Village == "El_Chaperno", "El Chaperno",
                        ifelse(dataDAPC$Village == "El Cerron, Olopa","El Cerron",
                        ifelse(dataDAPC$Village == "La Prensa, Olopa","La Prensa",
                        ifelse(dataDAPC$Village == "El_Carrizal","El Carrizal",""
                        )))))))))))))
dataDAPC$VillageCol 
#change the order of departmentshttps://www.datanovia.com/en/blog/how-to-change-ggplot-legend-order/
dataDAPC$Dept <- factor(dataDAPC$Dept, levels = c("Chiquimula" ,"Jutiapa" ,"Ahuachapan" ,   "Santa Ana" , "Sonsonate" ))
dataDAPC$DeptCol <- factor(dataDAPC$DeptCol, levels = c("dodgerblue" ,"blue2" ,"orange4" ,   "deeppink3" , "magenta" ))

dataDAPC$VillageShort <- factor(dataDAPC$VillageShort, levels = (c("El Cerron",
                                                                   "La Prensa",
                                                                 "El Carrizal",
                                                                 "El Chaperno",
                                                                 "Llano Santa María",
                                                                 "Llano de Dona María",
                                                                 "Chilcuyo",
                                                                 "El Jute",
                                                                 "El Zacatal",
                                                                 "La Primavera",
                                                                 "Azacualpa",
                                                                 "Cerro Alto",
                                                                 NA
                                                                 )))
dataDAPC$legend <- paste(dataDAPC$VillageShort, dataDAPC$VillageIDs)
#to change NA in dataDAPC$legend to "El Salvador":
dataDAPC$legend[is.na(dataDAPC$legend)]<-"El Salvador"
dataDAPC$legend[dataDAPC$legend=="NA "]<-"El Salvador"

dataDAPC$legend <- factor(dataDAPC$legend, levels = (c("El Cerron ECe",
                                                                   "La Prensa LPr",
                                                                   "El Carrizal ECa",
                                                                   "El Chaperno ECh",
                                                                   "Llano Santa María LSM",
                                                                   "Llano de Dona María LDM",
                                                                   "Chilcuyo Chi",
                                                                   "El Jute EJu",
                                                                   "El Zacatal EZa",
                                                                   "La Primavera LPi",
                                                                   "Azacualpa Aza",
                                                                   "Cerro Alto CAl",
                                                                   "El Salvador"
)))

dataDAPC$VillageCol <- factor(dataDAPC$VillageCol, levels = (c("blue",
                                                               "blueviolet",
                                                               "dodgerblue",
                                                               "green",
                                                               "cyan3",
                                                               "coral3",
                                                               "coral1",
                                                               "orange3",
                                                               "hotpink",
                                                               "deeppink",
                                                               "red",
                                                               "coral4",
                                                               "gray"
)))

#https://www.statology.org/r-add-column-to-data-frame-based-on-other-columns/
dataDAPC$Villagepch <- with(dataDAPC,ifelse(dataDAPC$Village=="El_Carrizal",8,19))
#dataDAPC$DeptCol <- colors$DepartmentColor
#dataDAPC$DeptCol <- rainbow(dim(dataDAPC)[1])
#plot(dataDAPC$LD1, dataDAPC$LD2, col=dataDAPC$DeptColor, pch=16)
# https://www.geeksforgeeks.org/how-to-color-scatter-plot-points-in-r/
#in a for loop, ggplot needs "print()": https://stackoverflow.com/questions/15678261/ggplot-does-not-work-if-it-is-inside-a-for-loop-although-it-works-outside-of-it
library(ggrepel)
library(ggforce)
image = ggplot(dataDAPC, aes(x=LD1, y=LD2,group=Dept,label=dataDAPC$VillageIDs))+ 
#  ggplot(dataDAPC, aes(x=LD1, y=LD2,group=Dept,label=dataDAPC$group))+ 
        geom_point(aes(color=legend), #shape=dataDAPC$Villagepch, 
                   cex=2.5, stroke = 1) + #stroke: https://stackoverflow.com/questions/51644418/how-to-change-the-linewidth-of-point-using-the-geom-point
        scale_color_manual(values = levels(unique(dataDAPC$VillageCol)))+ #https://stackoverflow.com/questions/15965870/fill-and-border-colour-in-geom-point-scale-colour-manual-in-ggplot
        ggtitle(paste0("K = ",x)) + # +xlim(-25,75)+ylim(-25,75) 
  labs(colour = "Village")+
    theme(text = element_text(size = 15))
#print(image)
image + geom_text_repel() + annotate(geom="text", x=-2, y=3, label="A", cex=10,color="black")+
              annotate(geom="text", x=-5, y=0.5, label="B", cex=10,color="black")+
              annotate(geom="text", x=6, y=1, label="C", cex=10,color="black") +
              stat_ellipse(mapping = aes(group=group),color="darkgray",level = 0.9) +  #https://r-charts.com/correlation/scatter-plot-ellipses-ggplot2/
              xlab("DAPC1 (74.56%)")+ ylab("DAPC2 (25.44%)")
# to get Ho and Hs from a genind object by village:
strata(my_genind) = data.frame(Country = strata2$Country,Department = strata2$DepartmentSA,Village =strata2$Village.1)
strata(my_genind)
setPop(my_genind) = ~Country/Department/Village
pop(my_genind)
Hs(my_genind)
Ho(my_genind)
VillageHsHo = data.frame(Hs = Hs(my_genind), Ho = Ho(my_genind))
stats = basic.stats(my_genind)
VillageHsHoFis = data.frame(Hs=colMeans(stats$Hs,na.rm = TRUE), #https://stackoverflow.com/questions/21807987/calculate-the-mean-for-each-column-of-a-matrix-in-r
                            Ho=colMeans(stats$Ho,na.rm = TRUE),
                            Fis=colMeans(stats$Fis,na.rm = TRUE))
VillageHsHoFis #this is for Mantel test.
# library(datapasta)
# df_paste()

#  https://stackoverflow.com/questions/52171700/r-draw-ellipse-in-scatterplot-not-stat-ellipse
#  geom_ellipse(aes(x0=0, y0=0, a=1, b=2,angle=30))#+geom_polygon()

  
# image + geom_label_repel(box.padding   = 0.35, #https://stackoverflow.com/questions/15624656/label-points-in-geom-point
#                          point.padding = 0.5,
#                          segment.color = 'grey50')
#image + geom_text(check_overlap = TRUE)
#ggsave(file="DAPC20230510.svg", plot=image, width=8, height=6)
#################################################


# strata(my_genind) = data.frame(Country = strata2$Country,Department = strata2$DepartmentSA,Village =strata2$Village.1)
# setPop(my_genind) <- ~Country/Department
# 
# 
# #read csv "colors" to get information about houses  and departments:
# colors <- read.csv("colors.csv")
# par(mar = c(5,5,5,5),xpd=TRUE)
# 
# #Graphs with departmente based coloring
# for (x in 3:10) {
#  
# grp <- find.clusters(my_genind, max.n.clust=40,n.pca = dim(dataDAPC)[1], n.clust = x )
# #erase strata data from genind object and add the pop$pop as the population
# strata(my_genind) = NULL
# pop(my_genind) = grp$grp
# table(pop(my_genind), grp$grp)
# dapc2 <- dapc(my_genind, grp = grp$grp,n.pca = 10, n.da = (length(unique(grp$grp))-1))
# dataDAPC <- dapc2$ind.coord
# dataDAPC <- as.data.frame(dataDAPC)
# dataDAPC$Dept <- as.factor(strata2$DepartmentSA)
# #dataDAPC$DeptCol <- colors$DepartmentColor
# dataDAPC$DeptCol <- rainbow(dim(dataDAPC)[1])
# #plot(dataDAPC$LD1, dataDAPC$LD2, col=dataDAPC$DeptColor, pch=16)
# # https://www.geeksforgeeks.org/how-to-color-scatter-plot-points-in-r/
# #in a for loop, ggplot needs "print()": https://stackoverflow.com/questions/15678261/ggplot-does-not-work-if-it-is-inside-a-for-loop-although-it-works-outside-of-it
# print(ggplot(dataDAPC, aes(x=LD1, y=LD2,group=Dept))+ geom_point(aes(color=Dept)) + ggtitle(paste0("K = ",x)))# +xlim(-25,75)+ylim(-25,75)
# #print(ggplot(dataDAPC, aes(x=LD1, y=LD2,group=Dept))+ geom_point(aes(color=Dept)) +xlim(-5,5)+ylim(-5,5)+ ggtitle(paste0("K = ",x)))
# }
# #warnings are due to some points taken out of the graphs when zooming in.

#things to try: confidence intervals https://rdrr.io/cran/hierfstat/man/boot.vc.html,
#https://grunwaldlab.github.io/Population_Genetics_in_R/Pop_Structure.html
#https://grunwaldlab.github.io/Population_Genetics_in_R/AMOVA.html