PopGenPaper / Scripts / Pairwise relatedness.R
Pairwise relatedness.R
Raw
#Relatedness analysis 20230324
#if (!requireNamespace("BiocManager", quietly=TRUE))
#   install.packages("BiocManager")
# BiocManager::install("gdsfmt")
# BiocManager::install("SNPRelate")
library(SNPRelate)
library(here)
library(dendextend)
library(circlize)


#Here the analysis was done with VCF filtered for missingness and only 50 individuals, MAF=0.01

setwd("/home/sergio/OneDrive/DocumentosData 1/Proyectos/Proyecto Salvador")
vcffile =  "batch_1NombreCorto_(1)_filteredMAF0.01.vcf"
gdsfile = "snpsNCMAF0.01.gds"
snpgdsVCF2GDS(vcffile, gdsfile, verbose=FALSE)
genofile <- snpgdsOpen(gdsfile)
#when I repeated this, I needed to use showfile.gds(closeall=TRUE) to close the gbs file, from https://github.com/zhengxwen/gdsfmt/issues/17
#to see the sample ids: read.gdsn(index.gdsn(genofile, "sample.id")) taken from https://www.bioconductor.org/packages/devel/bioc/vignettes/SNPRelate/inst/doc/SNPRelate.html
RV <- snpgdsIBS(genofile)
colors <- read.csv("colors.csv")
m <- 1 - RV$ibs
#colnames(m) <- rownames(m) <- RV$sample.id
colnames(m) <- rownames(m) <- colors$SampleID
#select only the samples with low missingness taken with match(rownames(SNPSmsSnpInd),vcfsamplenames)-1
samples50 <- c("A9944", "CHJ0007", "CHJ0011", "CHJ0083", "CHJ0155", "CHJ0188", "S174", "S236", "S327",
               "S328", "S333", "S336", "S358", "S359", "S443", "S445", "S446", "S447",
               "S448", "S451", "S455", "S476", "S478", "S486", "S507a", "S507b", "S649",
               "TEX0010", "TEX0023", "TPG1017", "TPG154", "TPG715", "TPS0053", "TPS0068", "TPS0180", "TPS0225",
               "TPS0330", "CHJ1016", "CHJ0168", "CHJ0169", "CHJ0172", "CHJ0360", "CHJ0362", "CHJ0363", "CHJ0364",
               "CHJ0366", "CHJ0367", "CHJ0376", "CHJ0378", "CHJ0386")
match(samples50,rownames(m))

GeneticDistance <- as.dist(m[match(samples50,rownames(m)),match(samples50,rownames(m))]) #these numbers have to be changed to adjust to the number of samples
HC <- hclust(GeneticDistance, "ave")
#labels(GeneticDistance) #to see the labels
#graphic HC with font size 0.6
HCD <- as.dendrogram(HC)
#exclude samples from colors object
colors <- colors[match(samples50,rownames(m)),] #these numbers have to be changed to adjust to the number of samples

labels_colors(HCD) <- "black"#colors$HouseColor[order.dendrogram(HCD)]
  


plot(HC, cex = 0.8)
par(cex=0.7)
plot(HCD, horiz = TRUE)
#house
#par(mfrow=c(2,2))
par(xpd=TRUE)
par(cex=0.4)
par(mar = c(4,4,1,18),xpd=TRUE)
HCD <- set(HCD, "labels_cex", 0.5)
plot(HCD, horiz =TRUE)
colored_bars(cbind(colors[,c(10,8,6,4)]), HCD, rowLabels = c("Country","Department","Village","House"),horiz = TRUE)
legend("topright", inset = c(-0.2,0), fill = c("green","blue"), legend = levels(factor(colors$Country)),cex = 0.5, inset = c(-0.63,0), title = "Country", xjust = 0)
legend("topright", fill = unique(colors$DepartmentColor), legend = unique(colors$Department),cex = 0.5, inset=c(-0.64,0.11), title = "Department", xjust = 0)
legend("topright", fill = unique(colors$VillageColor)[-3], legend = unique(colors$Village)[-3], cex = 0.5, inset=c(-1.0,0.3), xpd=TRUE, title = "Village", xjust = 0)
rect.dendrogram(HCD,h=0.07,horiz=TRUE,which = 59)
rect.dendrogram(HCD,h=0.08,horiz=TRUE,which = 28)
rect.dendrogram(HCD,h=0.1,horiz=TRUE,which = 20)
#colores: https://stackoverflow.com/questions/18802519/label-and-color-leaf-dendrogram
#y https://stackoverflow.com/questions/34539746/color-side-bar-dendrogram-plot/
#legend outside plot: https://stackoverflow.com/questions/3932038/plot-a-legend-outside-of-the-plotting-area-in-base-graphics

#https://r-charts.com/es/parte-todo/dendrograma-circular/
#color branches and other formats https://cran.r-project.org/web/packages/dendextend/vignettes/dendextend.html#the-set-function

#House with village.
par(mar=c(3,0,3,10),xpd=TRUE)
colors$HouseColor[colors$HouseColor=="white"] <- 'gray70'
  labels_colors(HCD) <- colors$HouseColor[order.dendrogram(HCD)]
  HCD <- set(HCD, "labels_cex", 0.7)
  HCD <- set(HCD, "branches_lwd", 3)
  #HCD <- set(HCD, "branches_col", c(rep(1,33),"aquamarine4",2,1,"aquamarine4",rep(1,100)))
  HCD %>% set("branches_col", c(1,1,1,1,1,"aquamarine",1,1,rep(1,26),"black","aquamarine4","aquamarine4",
                                rep(1,3),rep("chocolate2",2),1,1,"chocolate2",rep(1,4),
                                "green","green",1,"chartreuse4","chartreuse",1,1,1,1,1,1,1,1,
                                "chartreuse2","darkolivegreen2",
                                rep(1,27),"aquamarine",rep(1,19),"aquamarine",
                                1,1,1,1,1,1,1,1,1,"antiquewhite4","antiquewhite3",rep(1,100))) %>% 
    set("leaves_pch", c(rep(19,2),8,
                        rep(19,13),17,17,
                        19,rep(17,3),19,17,17,
                        9,9,rep(19,3),
                        9,9,rep(19,13),8,
                        rep(19,9),8,rep(19,6))) %>% 
    set("leaves_col", colors$VillageColor[order.dendrogram(HCD)]) %>%
    circlize_dendrogram()  
  #plot(horiz=TRUE)
  legend("topright", fill = unique(colors$VillageColor)[-3], legend = unique(colors$Village)[-3],cex = 0.5, 
         inset=c(-0.5,0), 
         title = "Village", xjust = 0)
  legend("topright",  legend = c("Same house clustered","Same village","Same house dispersed"),
         cex = 0.5, 
         inset=c(-0.24,0.8), 
         title = "Symbols", xjust = 0,pch = c(17, 9, 8))
  
  #Village, Department, Country
  par(mar=c(3,0,3,10),expand=TRUE)
  labels_colors(HCD) <- colors$CountryColor[order.dendrogram(HCD)]
  HCD %>% set("branches_col", c(1,"gray70",1,1,"darkorange","cornsilk3",1,"deeppink",
                                rep(1,2),"bisque",1,"brown3",1,"bisque","cornsilk3",
                                1,"coral1",1,1,"deeppink","cornsilk3",1,"burlywood",1,1,
                                "burlywood",1,"burlywood","cornsilk3",1,
                                "black",1,1,1,"cornsilk3","cornsilk3",
                                1,"burlywood",1,"cyan3","cyan3",1,
                                1,"cyan3",1,1,"cornflowerblue",1,
                                "cornflowerblue",
                                "cornflowerblue",1,"cornflowerblue",
                                "cornflowerblue",1,"deeppink",1,1,
                                "cornflowerblue",1,"cornflowerblue",
                                1,"cornflowerblue","cornflowerblue",1,
                                "brown1",1,1,1,"cornsilk3",1,"bisque2",
                                1,1,"orange","orange",1,1,"brown1","brown1",
                                1,"coral1",1,1,"bisque2",1,rep("bisque2",2),
                                1,1,"brown1","cornsilk3",1,"darkorange",
                                1,"brown1",1,
                                "brown1",1,"brown1",1,"bisque2",1,"gray",
                                1,"black",1,"bisque",1,1,"coral1","cornsilk3",
                                1,1,"coral1","coral1",1,"bisque2",1,1,1,"aquamarine",
                                "bisque",
                                rep(1,100))) %>% 
    set("leaves_pch", 19) %>% 
    set("leaves_col", colors$DepartmentColor[order.dendrogram(HCD)]) %>%
    
    #plot(horiz = TRUE)
    circlize_dendrogram()
  legend("topright", fill = c("green","blue"), legend = levels(factor(colors$Country)),cex = 0.5, inset=c(-0.15,-0.15), title = "Country (labels)", xjust = 0)
  legend("topright", fill = unique(colors$DepartmentColor), legend = unique(colors$Department),cex = 0.5, inset=c(-0.18,0.06), title = "Department (leaves)", xjust = 0)
  legend("topright", fill = unique(colors$VillageColor)[-3], legend = unique(colors$Village)[-3], cex = 0.5, inset=c(-0.51,0.4), xpd=TRUE, title = "Village (branches)", xjust = 0)
  
  
  #HCD <- set(HCD, "branches_col", c(rep(1,4),2,rep(1,4),3))
  
  #house
  #dev.new(width=10, height=10)
  
  #par(mfrow=c(2,2))
  par(mar=c(4,4,4,4),expand=TRUE)
  
  labels_colors(HCD) <- colors$HouseColor[order.dendrogram(HCD)]
  #plot(HCD, horiz =TRUE)
  circlize_dendrogram(HCD)
  
  
  #village
  
  
  par(mar=c(4,4,4,4),expand=TRUE)
  labels_colors(HCD) <- colors$VillageColor[order.dendrogram(HCD)]
  HCD %>% set("branches_col", c(1,"gray70",1,1,"darkorange","cornsilk3",1,"deeppink",
                                rep(1,2),"bisque",1,"brown3",1,"bisque","cornsilk3",
                                1,"coral1",1,1,"deeppink","cornsilk3",1,"burlywood",1,1,
                                "burlywood",1,"burlywood","cornsilk3",1,
                                "black",1,1,1,"cornsilk3","cornsilk3",
                                1,"burlywood",1,"cyan3","cyan3",1,
                                1,"cyan3",1,1,"cornflowerblue",1,
                                "cornflowerblue",
                                "cornflowerblue",1,"cornflowerblue",
                                "cornflowerblue",1,"deeppink",1,1,
                                "cornflowerblue",1,"cornflowerblue",
                                1,"cornflowerblue","cornflowerblue",1,
                                "brown1",1,1,1,"cornsilk3",1,"bisque2",
                                1,1,"orange","orange",1,1,"brown1","brown1",
                                1,"coral1",1,1,"bisque2",1,rep("bisque2",2),
                                1,1,"brown1","cornsilk3",1,"darkorange",
                                1,"brown1",1,
                                "brown1",1,"brown1",1,"bisque2",1,"gray",
                                1,"black",1,"bisque",1,1,"coral1","cornsilk3",
                                1,1,"coral1","coral1",1,"bisque2",1,1,1,"aquamarine",
                                "bisque",
                                rep(1,100))) %>% 
    #plot(horiz = TRUE)
    circlize_dendrogram()
  legend("topright", fill = c("green","blue"), legend = levels(factor(colors$Country)),cex = 0.5, inset=c(-0.63,0), title = "Country", xjust = 0)
  legend("topright", fill = unique(colors$DepartmentColor), legend = unique(colors$Department),cex = 0.5, inset=c(-0.64,0.11), title = "Department", xjust = 0)
  legend("topright", fill = unique(colors$VillageColor)[-3], legend = unique(colors$Village)[-3], cex = 0.5, inset=c(-1.0,0.3), xpd=TRUE, title = "Village", xjust = 0)
  
  
  #Department
  par(mar=c(4,4,4,4),expand=TRUE)
  labels_colors(HCD) <- colors$DepartmentColor[order.dendrogram(HCD)]
  HCD %>% set("branches_col", c(1,"chocolate2",1,1,"yellow","chocolate2",1,"yellow",
                                rep(1,2),"red",1,"red",1,"red","chocolate2",
                                1,"yellow",1,1,"yellow","chocolate2",1,"yellow",1,1,
                                "yellow",1,"yellow","chocolate2",1,
                                "green",1,1,1,"chocolate2","chocolate2",
                                1,"yellow",1,"chocolate2","chocolate2",1,
                                1,"chocolate2",1,1,"chocolate2",1,
                                "chocolate2",
                                "chocolate2",1,"chocolate2",
                                "chocolate2",1,"yellow",1,1,
                                "chocolate2",1,"chocolate2",
                                1,"chocolate2","chocolate2",1,
                                "green",1,1,"yellow","chocolate2",1,"yellow",
                                1,1,"aquamarine","aquamarine",1,1,"green","green",
                                1,"yellow",1,1,"yellow",1,rep("yellow",2),
                                1,1,"green","chocolate2",1,"yellow",
                                1,"green",1,
                                "green",1,"green",1,"yellow",1,"green",
                                1,"green",1,"red",1,1,"yellow","chocolate2",
                                1,1,"yellow","yellow",1,"yellow",1,"green",1,"red",
                                "red",
                                rep(1,100))) %>% 
    #plot(horiz = TRUE)
    circlize_dendrogram()
  
  #Country
  par(mar=c(4,4,4,4),expand=TRUE)
  labels_colors(HCD) <- colors$CountryColor[order.dendrogram(HCD)]
  HCD %>% set("branches_col", c(1,"blue",1,1,"green","blue",1,"green",
                                rep(1,2),"blue",1,"blue",1,"blue","blue",
                                1,"green",1,1,"green","blue",1,"green",1,1,
                                "green",1,"green","blue",1,
                                "green",1,1,1,"blue","blue",
                                1,"green",1,"blue","blue",1,
                                1,"blue",1,1,"blue",1,
                                "blue",
                                "blue",1,"blue",
                                "blue",1,"green",1,1,
                                "blue",1,"blue",
                                1,"blue","blue",1,
                                "green",1,1,"green","blue",1,"green",
                                1,1,"green","green",1,1,"green","green",
                                1,"green",1,1,"green",1,rep("green",2),
                                1,1,"green","blue",1,"green",
                                1,"green",1,
                                "green",1,"green",1,"green",1,"green",
                                1,"green",1,"blue",1,1,"green","blue",
                                1,1,"green","green",1,"green",1,1,1,"blue",
                                "blue",
                                rep(1,100))) %>% 
    #plot(horiz = TRUE)
    circlize_dendrogram()
  
  # Patricias' proposal
  #Country and Dept. 
  #labels of the dendrogram as sample ID
  HC$labels <- colors$SampleID
  HCD <- as.dendrogram(HC)

  HCD <- set(HCD, "labels_cex", 0.9)
  HCD <- set(HCD, "branches_lwd", 3)
  colors$CountryColor2 = ifelse(colors$Country == "Guatemala", "darkblue", "brown3")
  colors$DepartmentColor2 = ifelse(colors$Department == "Jutiapa", "blue2",
                                   ifelse(colors$Department == "Chiquimula", "dodgerblue3",
                                          ifelse(colors$Department == "Sonsonate", "magenta",
                                                 ifelse(colors$Department == "Ahuachapan","orange4",
                                                        ifelse(colors$Department == "Santa_Ana","deeppink3",NA
                                                        )))))
  
  par(mar=c(4,2,4,4),expand=TRUE)
  labels_colors(HCD) <- colors$DepartmentColor2[order.dendrogram(HCD)]
 HCD %>% set("branches_col", c("brown3",rep("darkblue",3),rep("brown3",7),rep("darkblue",5),
                                rep("brown3",12),
                                rep("brown3",8),rep("darkblue",3),"brown3",rep("darkblue",22),
                                rep("brown3",3),rep("darkblue",7),rep("brown3",6),"darkblue",
                                rep("brown3",100))) %>%
    set("leaves_pch", 19) %>% 
    set("leaves_col", colors$CountryColor2[order.dendrogram(HCD)]) %>%
    #plot(horiz = TRUE)
    circlize_dendrogram()
  legend("topright", fill = unique(colors$CountryColor2), legend = rev(levels(factor(colors$Country))),cex = 1, inset=c(-0.2,0), title = "\n Country \n (branches and leaves)", xjust = 0)
  legend("topright", fill = unique(colors$DepartmentColor2)[c(5,1,4,3,2)], legend = unique(colors$Department)[c(5,1,4,3,2)],cex = 1, inset=c(-0.2,0.5), title = "\n Department \n (labels)", xjust = 0)
  

  
  #Villages
  #https://stackoverflow.com/questions/66923282/create-new-column-based-on-another-variable
  colors$VillageColor2 = ifelse(colors$Village == "Santa_Ana", "orange",
                         ifelse(colors$Village == "El Zacatal, Amatepec", "hotpink",
                         ifelse(colors$Village == "El_Jute", "orange3",
                         ifelse(colors$Village == "El Zacatal, Amatepec", "orange4",
                         ifelse(colors$Village == "El_Jute", "magenta",
                         ifelse(colors$Village == "El Aguacate, Azacualpa, Municipio de Armenia", "red",
                         ifelse(colors$Village == "Chilguyo, Texistepeque", "coral1",
                         ifelse(colors$Village == "Caserio Sarita Loma, Aldea La Primavera", "deeppink",
                         ifelse(colors$Village == "Caserio San Raymundo, Canton Llano", "coral3",
                         ifelse(colors$Village == "Cantón Las Flores Cerro Alto, Municipio Caluco ", "coral4",
                         ifelse(colors$Village == "Llano_Santa_María", "cyan3",
                         ifelse(colors$Village == "El_Chaperno", "green",
                         ifelse(colors$Village == "El_Cerrón,_Olopa","blue",
                         ifelse(colors$Village == "La_Prensa,_Olopa","blueviolet",
                         ifelse(colors$Village == "El_Carrizal","dodgerblue",
                                "gray62"
                                                            )))))))))))))))
  
  
  #change house colors:
  colors$HouseColor = ifelse(colors$House == 'JUT-D-FM', 'deeppink2',
                      ifelse(colors$House == 'CHAP-D-65', 'deepskyblue4',
                      ifelse(colors$House == 'CHAP-D-125', 'cyan',
                      ifelse(colors$House == 'CHAP-D-155', 'cyan4',
                      ifelse(colors$House == 'CHAP-D-114-115', 'dodgerblue',
                      ifelse(colors$House == 'CHAP-D-54', 'dodgerblue4',
                      ifelse(colors$House == 'CHAP-D-64', 'deepskyblue',
                      ifelse(colors$House == '', 'gray62',
                      ifelse(colors$House == 'CHI-D137A', 'green',
                      ifelse(colors$House == 'CHI-D204', 'green3',
                      ifelse(colors$House == 'CHI-D48', 'green4',
                      ifelse(colors$House == 'CHI-D13', 'darkolivegreen',
                      ifelse(colors$House == 'CARR-D-39', 'deeppink2',
                      ifelse(colors$House == 'CHAP-D-81', 'purple',
                      ifelse(colors$House == 'CHAP-D-156', 'darkturquoise',
                      ifelse(colors$House == 'CARR-D-63', 'deeppink',
                      ifelse(colors$House == 'CARR-D-58', 'deeppink4',
                      ifelse(colors$House == 'CARR-D-57', 'darkorchid',
                      ifelse(colors$House == 'CARR-D-95', 'darkorchid4',
                      ifelse(colors$House == 'CARR-D-93', 'firebrick1',
                      ifelse(colors$House == 'CARR-D-18A', 'firebrick4',
                      ifelse(colors$House == 'CHI-D19', 'royalblue4',
                      NA))))))))))))))))))))))
  
  colors$VillageIDs = ifelse(colors$Village == "Santa_Ana", "SAn",
                                ifelse(colors$Village == "El Zacatal, Amatepec", "EZa",
                                ifelse(colors$Village == "El_Jute", "EJu",
                                ifelse(colors$Village == "El Aguacate, Azacualpa, Municipio de Armenia", "EAg",
                                ifelse(colors$Village == "Chilguyo, Texistepeque", "ChT",
                                ifelse(colors$Village == "Caserio Sarita Loma, Aldea La Primavera", "CSL",
                                ifelse(colors$Village == "Caserio San Raymundo, Canton Llano", "CSR",
                                ifelse(colors$Village == "Cantón Las Flores Cerro Alto, Municipio Caluco ", "CLF",
                                ifelse(colors$Village == "Llano_Santa_María", "LSM",
                                ifelse(colors$Village == "El_Chaperno", "ECh",
                                ifelse(colors$Village == "El_Cerrón,_Olopa","ECe",
                                ifelse(colors$Village == "La_Prensa,_Olopa","LPr",
                                ifelse(colors$Village == "El_Carrizal","ECa",""
                                )))))))))))))
  #create new variable using "unknown" for villages from El Salvador and keeping
  #Guatemalan villages.
  colors$Village2 = colors$Village
  colors$Village2 = ifelse(colors$Country == "El_Salvador","Unknown",
                           colors$Village2)
  #
  #change the labels for HC with village ids
  HC$labels <- paste0(colors$SampleID," ", colors$VillageIDs)
  HCD <- as.dendrogram(HC)
  
  par(mar=c(5,0,4,13),xpd=TRUE)
  colors$HouseColor[colors$HouseColor=="white"] <- 'gray62'
    labels_colors(HCD) <- colors$HouseColor[order.dendrogram(HCD)]
    HCD <- set(HCD, "labels_cex", 1)
    HCD <- set(HCD, "branches_lwd", 4)
    #HCD <- set(HCD, "branches_col", c(rep(1,33),"aquamarine4",2,1,"aquamarine4",rep(1,100)))
    HCD %>% 
      set("branches_col", c(rep(1,7),1,rep(1,3),rep("purple",3),rep(1,12),1,1,
                            rep(1,10),rep(1,7),rep(1,4),
                            1,1,rep("firebrick1",3),rep(1,1),rep("firebrick4",4),
                            rep(1,100)
      )) %>%
      set("leaves_cex", 2) %>% 
      set("leaves_bg" , colors$VillageColor2[order.dendrogram(HCD)]) %>% 
      #in order to know the houses with more than 1 bug I used:
      #table(colors$House)
      #then I used:
      #colors[which(colors$House=="CARR-D-18A"),]
      #colors[which(colors$House=="CARR-D-93"),]
      #colors[which(colors$House=="CHAP-D-81"),]
      
      set("leaves_pch", c(rep(19,3),rep(13,2),rep(8,2),rep(19,3),
                          rep(13,4),rep(19,6),rep(13,3),
                          rep(8,6), rep(19,3),rep(13,4),rep(19,4),rep(13,6),
                          rep(19,4)
      )) %>%
      
      #set("leaves_pch", 19) %>% 
      set("leaves_col", colors$VillageColor2[order.dendrogram(HCD)]) %>%
      circlize_dendrogram()  
    #plot(horiz=TRUE)
    legend("topright",fill = unique(colors$VillageColor2)[c(7, 5, 6, 4, 8, 13, 9, 10, 14, 11, 2, 12, 1)], 
           legend = paste(unique(colors$Village)[c(7, 5, 6, 4, 8, 13, 9, 10, 14, 11, 2, 12, 1)],"(",unique(colors$VillageIDs)[c(7, 5, 6, 4, 8, 13, 9, 10, 14, 11, 2, 12, 1)],")"),cex = 0.8, 
           inset=c(-0.6,-0.2), 
           title = "\n Village \n (leaves)", xjust = 0)
    
    legend("topright", fill = unique(colors$HouseColor)[-7], legend = unique(colors$House)[-7],cex = 0.8, 
           inset=c(-0.4,0.39), 
           title = "\n House (branches \n and labels)", xjust = 0)
    
    
    legend("topright",  legend = c("Same house genetically similar","Same village genetically similar"),
           cex = 0.7, 
           inset=c(-0.1,1.005), 
           title = "Symbols", xjust = 0,pch = c(8, 13))
    
    # add table in legend: https://stackoverflow.com/questions/15406969/add-table-aligned-text-blocks-to-plot-in-r
    # with colors: https://stackoverflow.com/questions/61288600/how-to-put-a-table-below-a-ggplot-and-colour-the-rows-by-the-same-grouping-facto
    # https://stackoverflow.com/questions/52491085/plot-the-table-with-colorized-cells-in-r
    
    
    
    
    #exportar a un archivo, hay que transformar primero clase "dist" a "data.frame"
    #write.table(as.matrix(GeneticDistance), "/media/sergio/OS/Users/SergioAlejandro/Documents/Proyectos/Lenap 2015/Proyecto SNP/Parentesco dentro de casas/distanciasNC.txt", sep="\t")
    #save an svg image of the plot 
    #dev.copy(svg, "dendrogram20220823.svg")
    #create the plot and then:
    #dev.off()
    #save an png image of the plot
    #dev.copy(png, "dendrogram.png")
    #dev.off()