Automated-Ki67-Labelling-Index / Final figures Ki67 prostate paper.R
Final figures Ki67 prostate paper.R
Raw
#############################################################################
####################### automated Ki67-Scores in Prostate final (Paper) ###################
#############################################################################
# 18.02.2021  EV, NCB
#Cohort consits of 11845 prostate cancer patients <- Slide xxx +xxxx +xxxx +xxx (Exclude xxxx)
   # and Heteroprognosis cohort with 630 patients (Pro 19) with 580 TMA Gleason scores made by Sauter


##############################################################################################################################################################################################
##############################################################################################################################################################################################
###################################### Figure 2 (Survival curves + 5years numeric suvrial ###########################################################
##############################################################################################################################################################################################
##############################################################################################################################################################################################

###Load raw data from VERSA 8  
Time1<-Sys.time()
setwd("/Users/niclasblessin/MS08_Prostate ki67 score/")   ####change your working direction here
###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("Blessin_autoKi67LI_per patient data.xlsx", sheet="Blessin_autoKi67LI", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]

### Include patients like the JMP-Datafilter
#B.data <- A.data.patients [A.data.patients$`Filter.(less.than.200.AE1/3.cells)` == "include",] ### I leave it on purpose so that we see later that we have excluded the AE1/3 cells to 100%
#B.data <- B.data [B.data$Final.filter == "include",]

##### pick up the data for analysis
C.data <-A.data.patients%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"=`Ki67.Li.(after.23um.exclusion)`)


####load packages for KM curves 
library(survival)
library(ggplot2)
library(rms)
library(survminer)
library(dplyr)
library(plotrix)
library(reshape2)
library(ggtext)

######################################                                       #########################################################################
######################################      Survival curves (NEJM style)WL    #########################################################################
######################################                                       #########################################################################

######## PSA recurrence-free survival  ############

######################## 4 groups ("<2.0%","2.0-5.0%","5.0-15.0%",">15.0%") #######################################
######################## 4 groups ("<2.0%","2.0-5.0%","5.0-15.0%",">15.0%") #######################################
######################## 4 groups ("<2.0%","2.0-5.0%","5.0-15.0%",">15.0%") #######################################

#####create groups
C.data$marker_groups = cut(C.data$Ki67.LI, breaks = c(-1, 2.0001, 5.00001, 15.0001, 101),                      ##### HERE to change  grouping ###
                           labels= c("\u200B< 2.0%","2.0-5.0%","5.1-15.0%","\u200B> 15.0%")) 



SurvObj <- with(C.data, Surv(BCR.months, BCR.censor == 0))
Ba.km.curve<- survfit(SurvObj ~ marker_groups, data = C.data, conf.type = "log-log")
ggsurv<-ggsurvplot(Ba.km.curve, size = 1.4,conf.int=FALSE,  fun=NULL, title="", linetype = 1, censor=FALSE, 
                   xlim=c(0,90),
                   xlab = "Survival months", ylab = "PSA recurence-free survival",
                   break.x.by = 12, break.y.by = 0.1,
                   font.main = c(25, "bold", "black"),font.x = c(25, "bold", "black"), font.y = c(25, "bold", "black"),font.tickslab = c(25, "bold", "black"),
                   legend=c(0.15, 0.2), legend.labs=c("\u200B< 2.0%","2.0-5.0%","5.1-15.0%","\u200B> 15.0%"),   legend.title="",font.legend = c(25,"bold", "black"), 
                   palette=c("royalblue3","mediumseagreen","orange","firebrick"),
                   pval=TRUE,pval.size = 6, pval.coord = c(80, 0.9), 
                   tables.height = 0.1,
                   risk.table="abs_pct", risk.table.fontsize = 4.5, risk.table.y.text = TRUE,risk.table.y.text.col = TRUE,
                   risk.table.height=.25, surv.plot.height = 0.75)
Bb.sum <- summary(Ba.km.curve)$table

### Hazard plot ###
fit.coxph<-coxph(SurvObj ~ marker_groups, data = C.data)
#names(summary(fit.coxph))
C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="\u200B< 2.0%" ] <- "1"                
C.data$marker_groups_2[C.data$marker_groups =="2.0-5.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.1<-summary(fit.coxph)$conf.int[,1]
low95.out.1<-summary(fit.coxph)$conf.int[,3]
high95.out.1<-summary(fit.coxph)$conf.int[,4]
p.out.1<-summary(fit.coxph)$coefficients[,5]

C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="2.0-5.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="5.1-15.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.2<-summary(fit.coxph)$conf.int[,1]
low95.out.2<-summary(fit.coxph)$conf.int[,3]
high95.out.2<-summary(fit.coxph)$conf.int[,4]
p.out.2<-summary(fit.coxph)$coefficients[,5]


C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="5.0-15.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="\u200B> 15.0%" ] <- "2" 
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.3<-summary(fit.coxph)$conf.int[,1]
low95.out.3<-summary(fit.coxph)$conf.int[,3]
high95.out.3<-summary(fit.coxph)$conf.int[,4]
p.out.3<-summary(fit.coxph)$coefficients[,5]


### add text 

ggsurv$plot <- ggsurv$plot +
  ggplot2::annotate("text", 
                    x = 60, y = 0.3, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.1, digits = 2),"(95% CI,",round(low95.out.1, digits = 2),"-",round(high95.out.1, digits = 2),"), p=", signif(p.out.1,digits=1))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 60, y = 0.2, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.2, digits = 2),"(95% CI,",round(low95.out.2, digits = 2),"-",round(high95.out.2, digits = 2),"), p=", signif(p.out.2,digits=1))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 60, y = 0.1, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.3, digits = 2),"(95% CI,",round(low95.out.3, digits = 2),"-",round(high95.out.3, digits = 2),"), p=", signif(p.out.3,digits=1))), size = 4.5)

### show the plot
ggsurv







######## PSA recurrence-free survival  ############
######################## 9 groups ("<1.0%","1.0-2.5%","2.5-4%","4-7%","7-10%","10-13%","13-23%","23-40%",">40%") #######################################
######################## 9 groups ("<1.0%","1.0-2.5%","2.5-4%","4-7%","7-10%","10-13%","13-23%","23-40%",">40%") #######################################
######################## 9 groups ("<1.0%","1.0-2.5%","2.5-4%","4-7%","7-10%","10-13%","13-23%","23-40%",">40%") #######################################

#####create groups
C.data$marker_groups = cut(C.data$Ki67.LI, breaks = c(-1, 1.0001, 2.50001, 4.0001,7.0001, 10.0001,13.0001,23.0001,40.0001,101),                      ##### HERE to change  grouping ###
                           labels= c("\u200B< 1.0%","1.0-2.5%","2.6-4.0%","4.1-7.0%","7.1-10.0%","10.1-13.0%","13.1-23.0%","23.1-40.0%","\u200B> 40.0%")) 



SurvObj <- with(C.data, Surv(BCR.months, BCR.censor == 0))
Ba.km.curve<- survfit(SurvObj ~ marker_groups, data = C.data, conf.type = "log-log")
ggsurv<-ggsurvplot(Ba.km.curve, size = 1.4,conf.int=FALSE,  fun=NULL, title="", linetype = 1, censor=FALSE, 
                   xlim=c(0,90),
                   xlab = "Survival months", ylab = "PSA recurence-free survival",
                   break.x.by = 12, break.y.by = 0.1,
                   font.main = c(25, "bold", "black"),font.x = c(25, "bold", "black"), font.y = c(25, "bold", "black"),font.tickslab = c(25, "bold", "black"),
                   legend=c(0.08, 0.25), legend.labs=c("\u200B<1.0%","1.0-2.5%","2.6-4.0%","4.1-7.0%","7.1-10.0%","10.1-13.0%","13.1-23.0%","23.1-40.0%","\u200B> 40.0%"),   legend.title="",font.legend = c(15,"bold", "black"), 
                   palette=c("black","darkblue","royalblue3","forestgreen","mediumseagreen","orange","gold3","red","firebrick"),
                   pval=TRUE,pval.size = 6, pval.coord = c(75, 0.9), 
                   tables.height = 0.1,
                   risk.table="abs_pct", risk.table.fontsize = 4.5, risk.table.y.text = TRUE,risk.table.y.text.col = TRUE,
                   risk.table.height=.25, surv.plot.height = 0.75)
Bb.sum <- summary(Ba.km.curve)$table




### show the plot
ggsurv



### Hazard plot ###
fit.coxph<-coxph(SurvObj ~ marker_groups, data = C.data)
#names(summary(fit.coxph))
C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="\u200B< 1.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="1.0-2.5%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.1<-summary(fit.coxph)$conf.int[,1]
low95.out.1<-summary(fit.coxph)$conf.int[,3]
high95.out.1<-summary(fit.coxph)$conf.int[,4]
p.out.1<-summary(fit.coxph)$coefficients[,5]

C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="1.0-2.5%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="2.6-4.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.2<-summary(fit.coxph)$conf.int[,1]
low95.out.2<-summary(fit.coxph)$conf.int[,3]
high95.out.2<-summary(fit.coxph)$conf.int[,4]
p.out.2<-summary(fit.coxph)$coefficients[,5]


C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="2.5-4.0%"] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="4.1-7.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.3<-summary(fit.coxph)$conf.int[,1]
low95.out.3<-summary(fit.coxph)$conf.int[,3]
high95.out.3<-summary(fit.coxph)$conf.int[,4]
p.out.3<-summary(fit.coxph)$coefficients[,5]



C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="4.1-7.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="7.1-10.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.4<-summary(fit.coxph)$conf.int[,1]
low95.out.4<-summary(fit.coxph)$conf.int[,3]
high95.out.4<-summary(fit.coxph)$conf.int[,4]
p.out.4<-summary(fit.coxph)$coefficients[,5]



C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="7.1-10.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="10.1-13.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.5<-summary(fit.coxph)$conf.int[,1]
low95.out.5<-summary(fit.coxph)$conf.int[,3]
high95.out.5<-summary(fit.coxph)$conf.int[,4]
p.out.5<-summary(fit.coxph)$coefficients[,5]


C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="10.1-13.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="13.1-23.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.6<-summary(fit.coxph)$conf.int[,1]
low95.out.6<-summary(fit.coxph)$conf.int[,3]
high95.out.6<-summary(fit.coxph)$conf.int[,4]
p.out.6<-summary(fit.coxph)$coefficients[,5]



C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="13.1-23.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="23.1-40.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.7<-summary(fit.coxph)$conf.int[,1]
low95.out.7<-summary(fit.coxph)$conf.int[,3]
high95.out.7<-summary(fit.coxph)$conf.int[,4]
p.out.7<-summary(fit.coxph)$coefficients[,5]



C.data$marker_groups_2 <- NA
C.data$marker_groups_2[C.data$marker_groups =="23.1-40.0%" ] <- "1"
C.data$marker_groups_2[C.data$marker_groups =="\u200B> 40.0%" ] <- "2"
fit.coxph<-coxph(SurvObj ~ marker_groups_2, data = C.data)
HR.out.8<-summary(fit.coxph)$conf.int[,1]
low95.out.8<-summary(fit.coxph)$conf.int[,3]
high95.out.8<-summary(fit.coxph)$conf.int[,4]
p.out.8<-summary(fit.coxph)$coefficients[,5]


### add text 
ggsurv$plot <- ggsurv$plot +
  ggplot2::annotate("text", 
                    x = 50, y = 0.35, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.1, digits = 2),"(95% CI,",round(low95.out.1, digits = 2),"-",round(high95.out.1, digits = 2),"), p=", signif(p.out.1,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.30, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.2, digits = 2),"(95% CI,",round(low95.out.2, digits = 2),"-",round(high95.out.2, digits = 2),"), p=", signif(p.out.2,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.25, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.3, digits = 2),"(95% CI,",round(low95.out.3, digits = 2),"-",round(high95.out.3, digits = 2),"), p=", signif(p.out.3,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.20, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.4, digits = 2),"(95% CI,",round(low95.out.4, digits = 2),"-",round(high95.out.4, digits = 2),"), p=", signif(p.out.4,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.15, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.5, digits = 2),"(95% CI,",round(low95.out.5, digits = 2),"-",round(high95.out.5, digits = 2),"), p=", signif(p.out.5,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.1, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.6, digits = 2),"(95% CI,",round(low95.out.6, digits = 2),"-",round(high95.out.6, digits = 2),"), p=", signif(p.out.6,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.05, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.7, digits = 2),"(95% CI,",round(low95.out.7, digits = 2),"-",round(high95.out.7, digits = 2),"), p=", signif(p.out.7,digits=2))), size = 4.5)+
  ggplot2::annotate("text", 
                    x = 50, y = 0.0, # x and y coordinates of the text
                    label = c(paste("HR=",round(HR.out.8, digits = 2),"(95% CI,",round(low95.out.8, digits = 2),"-",round(high95.out.8, digits = 2),"), p=", signif(p.out.8,digits=2))), size = 4.5)




### show the plot
ggsurv

######exporting size 10*12  

##### real TMA
######################################                                        #########################################################################
######################################    Numeric (Cutoff free) 5yersRFS     #########################################################################
######################################                                        #########################################################################
######## PSA recurrence-free survival  ############



D.data <- C.data%>% select("BCR.censor","BCR.months","Ki67.LI")

### test for NA and remove them
#my.data <-na.omit(my.data)
any(is.na(D.data))
sum(is.na(D.data))
D.dta <- D.data[complete.cases(D.data),]
any(is.na(D.dta))
summary(D.dta)
                                  
seq.range<-function (x, ...) {
  rx <- range(x, na.rm = TRUE)
  seq(rx[1], rx[2], ...)
}


### survival-Variablen 

D.dta$status<-1-as.numeric(D.dta$BCR.censor)
D.dta$time<-D.dta$BCR.months
D.dta$Ki67.LI<-as.numeric(D.dta$Ki67.LI)
D.dta <- D.dta[complete.cases(D.dta),]


### Cut into quartile based groups
library(dplyr)

### survival analysis
final.cox<-coxph(Surv(time,status)~pspline(Ki67.LI,5),D.dta)

surv_rates<-function(mod, pred_data, times=60){
  s1<-summary(survfit(mod,newdata=pred_data),times=times)
  cbind(pred_data,do.call(data.frame,lapply(s1[c("surv","lower","upper")],as.vector)))}


### predicton data
pdata<-data.frame(Ki67.LI=seq.range(D.dta$Ki67.LI,length.out=100))
surv5y_final<-cbind(surv_rates(final.cox,pdata), model="penalized spline 5th deg")
ggplot(surv5y_final,aes(x=Ki67.LI,y=surv))+geom_line(aes(colour=model))+geom_ribbon(aes(ymin=lower,ymax=upper,fill=model),alpha=.25)+theme_bw()+
  labs(x="Auto Ki67-LI",y="5-years PSA recurence-free survival") +theme(legend.position="none")+ scale_y_continuous(limits=c(0, 1), expand = c(0, 0)) +scale_x_continuous(limits=c(0, 30), expand = c(0, 0)) 


##### hetero
######################################                                        #########################################################################
######################################    Numeric (Cutoff free) 5yersRFS     #########################################################################
######################################                                        #########################################################################
######## PSA recurrence-free survival  ############

###Load raw data  

setwd("/Users/niclasblessin/MS08_Prostate ki67 score/")   ####change your working direction here
###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("Blessin_autoKi67LI_per patient data hetero.xlsx", sheet="Blessin_autoKi67LI_hetero", colNames = TRUE)

### Include patients like the JMP-Datafilter
#B.data <- A.data.patients[A.data.patients$Filter.final == "include",]
#B.data <- B.data[B.data$RPE_TMA == "Pro 19",] 


### calculations for the Paper 
library(dplyr)
C.data <-B.data%>% select("TMA1place"="RPE_Punch1_TMAlocalisation","TMA2place"="RPE_Punch2_TMAlocalisation","TMA3place"="RPE_Punch3_TMAlocalisation",
                          "TMA4place"="RPE_Punch4_TMAlocalisation","TMA5place"="RPE_Punch5_TMAlocalisation","TMA6place"="RPE_Punch6_TMAlocalisation",
                          "Foci1.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy1","Foci2.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy2",
                          "Foci3.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy3","Foci4.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy4",
                          "Foci5.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy5","Foci6.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy6",
                          "Ki67ofMAXGleason"="Ki67-LI.of.spot.with.max.Gleason",
                          "IQ_Gleason"="RPE_IQ_Gleason",
                          "BCR.censor"="BCR_censor",
                          "BCR.months"="BCR_months")





######################################                                        #########################################################################
######################################    Numeric (Cutoff free) 5yersRFS     #########################################################################
######################################                                        #########################################################################
######## PSA recurrence-free survival  ############



D.data <- C.data%>% select("BCR.censor","BCR.months","IQ_Gleason")

### test for NA and remove them
#my.data <-na.omit(my.data)
any(is.na(D.data))
sum(is.na(D.data))
D.dta <- D.data[complete.cases(D.data),]
any(is.na(D.dta))
summary(D.dta)
#dta <-na.omit(dta)

########## full automatic plot(df$ID, df$UID, xlab=names(df)[1], ylab=names(df)[2], type='l') xlab= names(df)/rownames(df)
#yy <- D.data$Ki67.LI
# Select Subgroup
#dta <- dta[dta$RPE_Gleason == "3+4",]
### numerische scala einführen                                    
seq.range<-function (x, ...) {
  rx <- range(x, na.rm = TRUE)
  seq(rx[1], rx[2], ...)
}


### survival-Variablen 

D.dta$status<-1-as.numeric(D.dta$BCR.censor)
D.dta$time<-D.dta$BCR.months
#D.dta$marker_num<-as.numeric(D.dta$Ki67.LI)                 #### here ####
#A.dta$gleason_iq<- as.numeric(A.dta$gleason_iq)
D.dta$IQ_Gleason<-as.numeric(D.dta$IQ_Gleason)
D.dta <- D.dta[complete.cases(D.dta),]


### Cut into quartile based groups
library(dplyr)
#D.dta$Ki67.LI_a <- ntile(D.dta$Ki67.LI,9) 
### survival analysis
library(survival)
final.cox<-coxph(Surv(time,status)~pspline(IQ_Gleason,5),D.dta)

surv_rates<-function(mod, pred_data, times=60){
  s1<-summary(survfit(mod,newdata=pred_data),times=times)
  cbind(pred_data,do.call(data.frame,lapply(s1[c("surv","lower","upper")],as.vector)))}


### predicton data
pdata<-data.frame(IQ_Gleason=seq.range(D.dta$IQ_Gleason,length.out=100))
surv5y_final<-cbind(surv_rates(final.cox,pdata), model="penalized spline 5th deg")
ggplot(surv5y_final,aes(x=IQ_Gleason,y=surv))+geom_line(colour="orange")+
  geom_ribbon(aes(ymin=lower,ymax=upper),fill="orange",alpha=.25)+theme_bw()+
  labs(x="Automated Ki67-LI",y="5-years PSA recurence-free survival") +
  theme(legend.position="none")+ scale_y_continuous(limits=c(0, 1), expand = c(0, 0)) +scale_x_continuous(limits=c(0, 100), expand = c(0, 0)) 




######################################################################################################################################################################################################
######################################################################################################################################################################################################
############################################################ Figure 3 ROC curve Gleason scores vs. Ki67 ############################################################ 
######################################################################################################################################################################################################
######################################################################################################################################################################################################


######################################                                                                     #########################################################################
######################################    17k classical Prostate (ki67+IQ, Quant Gleason, Gleason, ISUP)  #########################################################################
######################################                                                                    #########################################################################
# Ki67 Mean of 6  vs. GS::SUM_Gleason_Berechnet --> ISUP vs. ISUP + auto-Ki67-LI
# Package riskRegression (12.2020) (https://github.com/tagteam/riskRegression)

### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)
###Load raw data from VERSA 8  
setwd("/Users/niclasblessin/Nico/Forschung/# My paperwork/MS08_Prostate ki67 score/") 
#setwd("C:/Users/Workstation3/Dropbox/Multiplex IF AG") 
#setwd("C:/Users/Workstation 5/Dropbox/Multiplex IF AG") 
#setwd("C:/Users/Patho 1/Dropbox/Multiplex IF AG")

###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("2021_03_17 Prostate reanalysis.xlsx", sheet="R", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]

### Include patients like the JMP-Datafilter
B.data <- A.data.patients [A.data.patients$`Filter.(less.than.200.AE1/3.cells)` == "include",]
#B.data <- A.data.patients
B.data <- B.data [B.data$Final.filter == "include",]
#B.data <- B.data [B.data$`FIlter.2.(6000.patients)` == "include"|B.data$Slide == "ab0290"
#|B.data$Slide == "ab0293"|B.data$Slide == "ab0297"|B.data$Slide == "ab0300"|B.data$Slide == "ab0301"
#|B.data$Slide == "ab0303"|B.data$Slide == "ab0305"|B.data$Slide == "ab0308"|B.data$Slide == "ab0309"
#|B.data$Slide == "ab0310"|B.data$Slide == "ab0314"|B.data$Slide == "ab0315"|B.data$Slide == "ab0316",]


#### pickup data needed
C.data <-B.data%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"="Ki67.Li.(after.23um.exclusion)",
                          "ISUP"="pGleason.of.Pro17k_clean_NB",
                          "Gleason"="RPE_Gleason_groups",
                          "QuantGleason"="RPE_quant_Gleason",
                          "IQGleason"="RPE_IQ_Gleason")




### understanding the data
barplot(table(C.data$Gleason))

### Change to ISUP
C.data<-C.data%>% mutate(ISUP=recode(ISUP,"+"="1","0+"="1","2+2"="1","2+2"="1","2+3"="1","2+4"="1","3+2"="1",
                                     "3+3"="1",
                                     "3+4"="2",
                                     "3+5"="4",
                                     "4+3"="3",
                                     "4+4"="4",
                                     "4+5"="5",
                                     "5+3"="4",
                                     "5+4"="5",
                                     "5+5"="5"))
### understanding the data
barplot(table(C.data$ISUP))
barplot(table(C.data$Gleason))
barplot(table(C.data$QuantGleason))
barplot(table(C.data$IQGleason))

### Use always numeric values if possible! 
C.data$ISUP <- as.numeric(C.data$ISUP)
C.data$IQGleason <- as.numeric(C.data$IQGleason)
plot(density(C.data$IQGleason))

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### auto ki67-Li
c1<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI, C.data)
### ISUP
c2<-coxph(Surv(BCR.months,BCR.censor==0)~ISUP, C.data)
### auto-Ki67+Gelason 
c3<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+ISUP, C.data)
### QuantGleason
c4<-coxph(Surv(BCR.months,BCR.censor==0)~QuantGleason, C.data)
### IQ Gleason
c5<-coxph(Surv(BCR.months,BCR.censor==0)~IQGleason, C.data)
### auto-Ki67 + IQ Gleason
c6<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+IQGleason, C.data)
mdta<-model.frame(~BCR.months+BCR.censor+Ki67.LI+ISUP+Gleason+QuantGleason+IQGleason, C.data)

### calculate predictors
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred3~pred2,mdta)
plot(pred4~pred3,mdta)
plot(pred5~pred4,mdta)
plot(pred6~pred5,mdta)


### Multivariate regression analysis step by step
times<-5*12

ROC.data <- Score(list("Ki67-LI"=mdta$pred1,"ISUP"=mdta$pred2,"Ki67+ISUP"=mdta$pred3,"QuantGleason"=mdta$pred4,
                       "IQGleason"=mdta$pred5,"Ki67+IQGlea."=mdta$pred6),
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
p.RR<- p.RR[c(1,6,10,13,15)] #### select the p-values from the list we need
#AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",sprintf("%.2f", round(AUC,2))," ","(",sprintf("%.2f", round(lower,2)),"-",sprintf("%.2f", round(upper,2)),")")
AUC.RR$y<-c(0.1,0.15,0.2,0.25,0.3,0.35) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Ki67-LI","ISUP","Ki67+ISUP","QuantGleason",
                                                "IQGleason","Ki67+IQGlea.")) ### order them here!


### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.2,0.3,0.5,0.7,0.9)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.9)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=alpha(c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),c(1,1,1,1,1,1)))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")


ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

plotRisk(ROC.data,times=times)

# Distribuiton of the Data
hist(C.data$ISUP)
plot(density(C.data$ISUP))


### AUC-with-Bootstrap-CI.R --- 
#----------------------------------------------------------------------
## Author: Paul Blanche
## Created and updated: Jan 28 2021 (12:25) 


rm(list=ls())
###Import and select Data
detach("package:xlsx")
library("openxlsx")
library(dplyr)
Header = TRUE 
A.data.patients=read.xlsx("2021_03_17 Prostate reanalysis.xlsx", sheet="R", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]

### Include patients like the JMP-Datafilter
B.data <- A.data.patients [A.data.patients$`Filter.(less.than.200.AE1/3.cells)` == "include",]
#B.data <- A.data.patients
B.data <- B.data [B.data$Final.filter == "include",]
#B.data <- B.data [B.data$`FIlter.2.(6000.patients)` == "include"|B.data$Slide == "ab0290"
#|B.data$Slide == "ab0293"|B.data$Slide == "ab0297"|B.data$Slide == "ab0300"|B.data$Slide == "ab0301"
#|B.data$Slide == "ab0303"|B.data$Slide == "ab0305"|B.data$Slide == "ab0308"|B.data$Slide == "ab0309"
#|B.data$Slide == "ab0310"|B.data$Slide == "ab0314"|B.data$Slide == "ab0315"|B.data$Slide == "ab0316",]


#### pickup data needed
C.data <-B.data%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"="Ki67.Li.(after.23um.exclusion)",
                          "ISUP"="pGleason.of.Pro17k_clean_NB",
                          "Gleason"="RPE_Gleason_groups",
                          "QuantGleason"="RPE_quant_Gleason",
                          "IQGleason"="RPE_IQ_Gleason")




### understanding the data
barplot(table(C.data$Gleason))

### Change to ISUP
C.data<-C.data%>% mutate(ISUP=recode(ISUP,"+"="1","0+"="1","2+2"="1","2+2"="1","2+3"="1","2+4"="1","3+2"="1",
                                     "3+3"="1",
                                     "3+4"="2",
                                     "3+5"="4",
                                     "4+3"="3",
                                     "4+4"="4",
                                     "4+5"="5",
                                     "5+3"="4",
                                     "5+4"="5",
                                     "5+5"="5"))
### understanding the data
barplot(table(C.data$ISUP))
barplot(table(C.data$Gleason))
barplot(table(C.data$QuantGleason))
barplot(table(C.data$IQGleason))

### Use always numeric values if possible! 
C.data$ISUP <- as.numeric(C.data$ISUP)
C.data$IQGleason <- as.numeric(C.data$IQGleason)
plot(density(C.data$IQGleason))

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

#### pickup data needed
C.data <-C.data%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"="Ki67.LI",
                          "ISUP"="ISUP")

library(survival)
library(riskRegression)

# {{{ Example without Bootstrap for 95% computation
ResPaquid <- Score(list("Ki67.LI"=C.data$Ki67.LI,"ISUP"=C.data$ISUP),
                   formula=Hist(BCR.months,BCR.censor==0)~1,
                   data=C.data,
                   null.model = FALSE,
                   conf.int=TRUE,
                   metrics=c("auc"),
                   times=c(5),
                   plots="ROC")
ResPaquid$AUC

## Results of model comparisons:

plotROC(ResPaquid,time=5)
# }}}

# {{{ Example with Bootstrap for 95% computation
# set random seed for reproducibility
set.seed(20210128)
# number of Bootstrap samples
nB <- 500
# initialize vector to save results from each iteration
AllAUC <- matrix(NA,nrow=nB,ncol=2)
# Bootstrap
for(b in 1:nB){
  # monitor computation progress
  print(paste0("Step b=",b," out of ",nB))
  # Create Bootstrapped dataset
  db <- C.data[sample(x=1:nrow(C.data),size=nrow(C.data),replace=TRUE),]
  # Estimate AUC with Bootstrap data
  Res2b <- Score(list("Ki67.LI"=C.data$Ki67.LI,"ISUP"=C.data$ISUP),
                 formula=Hist(BCR.months,BCR.censor==0)~1,
                 data=C.data,
                 null.model = FALSE,
                 conf.int=TRUE,
                 metrics=c("auc"),
                 times=c(5),
                 plots="ROC")
  # Save result
  AllAUC[b,] <- Res2b$AUC$score$AUC
}

# Wald type 95% CI, i.e., se estimated from Bootstrapping. OK if nB >= 400
# For KI67-LI
CIboot1 <- c(ResPaquid$AUC$score$AUC[1]-1.96*sd(AllAUC[,1]),ResPaquid$AUC$score$AUC[1]+1.96*sd(AllAUC[,1]))
CIboot1

# For ISUP
CIboot2 <- c(ResPaquid$AUC$score$AUC[2]-1.96*sd(AllAUC[,2]),ResPaquid$AUC$score$AUC[2]+1.96*sd(AllAUC[,2]))
CIboot2

#  Wald type p-value and CI for the difference
CI <- c(ResPaquid$AUC$score$AUC[2]-ResPaquid$AUC$score$AUC[1]- 1.96*sd(AllAUC[,2]-AllAUC[,1]),
        ResPaquid$AUC$score$AUC[2]-ResPaquid$AUC$score$AUC[1]+ 1.96*sd(AllAUC[,2]-AllAUC[,1]))
CI
pvalue <- 2*pnorm(abs(ResPaquid$AUC$score$AUC[2]-ResPaquid$AUC$score$AUC[1])/sd(AllAUC[,2]-AllAUC[,1])
                  ,lower.tail=FALSE)
pvalue
# }}}

#----------------------------------------------------------------------
### AUC-with-Bootstrap-CI.R ends here






#ISUP aus mean
######################################                                                                     #########################################################################
######################################    HeteroProg (ki67+IQ, Quant Gleason, Gleason, ISUP)  #########################################################################
######################################                                                                    #########################################################################
# Ki67 Mean of 6  vs. GS::SUM_Gleason_Berechnet --> ISUP vs. ISUP + auto-Ki67-LI
# Package riskRegression (12.2020) (https://github.com/tagteam/riskRegression)

###Load raw data  
setwd("/Users/niclasblessin/MS08_Prostate ki67 score/") 


###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
#A.data.patients <- A.data.patients[1:800,]
#B.data.nomogram=read.xlsx("Prostate TMA 2nd round 39 slides new2 FINAL.xlsx", sheet="Nomogram", colNames = TRUE)

### Include patients like the JMP-Datafilter
B.data <- A.data.patients [A.data.patients$Filter.final == "include",]
B.data <- B.data[B.data$RPE_TMA == "Pro 19",] 

### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)


#### pickup data needed
C.data <-B.data%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"="Mean.of.6.(23um)",
                          "Gleason"="GS::SUM_Gleason_Berechnet",
                          "QuantGleason"="GS::SUM_quant_Gleason",
                          "IQGleason"="GS::SUM_IQ_Gleason")

### understanding the data
barplot(table(B.data$`GS::SUM_quant_Gleason`))
barplot(table(B.data$`GS::SUM_IQ_Gleason`))

### Change to ISUP
C.data<-C.data%>% mutate(ISUP=recode(Gleason,"3+3"="1",
                                     "3+4"="2",
                                     "3+4 Tert.5"="4",
                                     "3+5"="4",
                                     "4+3"="3",
                                     "4+3 Tert.5"="5",
                                     "4+4"="4",
                                     "4+5"="5",
                                     "5+3"="4",
                                     "5+4"="5",
                                     "5+5"="5"))


### understanding the data
barplot(table(C.data$ISUP))
barplot(table(C.data$Gleason))
barplot(table(C.data$QuantGleason))
barplot(table(C.data$IQGleason))

### Use always numeric values if possible! 
C.data$ISUP <- as.numeric(C.data$ISUP)
C.data$IQGleason <- as.numeric(C.data$IQGleason)

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### auto ki67-Li
c1<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI, C.data)
### ISUP
c2<-coxph(Surv(BCR.months,BCR.censor==0)~ISUP, C.data)
### Ki67+Gleason
c3<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+ISUP, C.data)
### QuantGleason
c4<-coxph(Surv(BCR.months,BCR.censor==0)~QuantGleason, C.data)
### IQ Gleason
c5<-coxph(Surv(BCR.months,BCR.censor==0)~IQGleason, C.data)
### auto-Ki67 + IQ Gleason
c6<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+IQGleason, C.data)
mdta<-model.frame(~BCR.months+BCR.censor+Ki67.LI+ISUP+Gleason+QuantGleason+IQGleason, C.data)


### calculate predictors
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred3~pred2,mdta)
plot(pred4~pred3,mdta)
plot(pred5~pred4,mdta)
plot(pred6~pred5,mdta)

### Multivariate regression analysis step by step
times<-5*12

ROC.data <- Score(list("Ki67-LI"=mdta$pred1,"ISUP"=mdta$pred2,
                       "Ki67+ISUP"=mdta$pred3,"QuantGleason"=mdta$pred4,
                       "IQGleason"=mdta$pred5,"Ki67+IQGlea."=mdta$pred6),
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
p.RR<- p.RR[c(1,6,10,13,15)] #### select the p-values from the list we need
#AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",sprintf("%.2f", round(AUC,2))," ","(",sprintf("%.2f", round(lower,2)),"-",sprintf("%.2f", round(upper,2)),")")
AUC.RR$y<-c(0.1,0.15,0.2,0.25,0.3,0.35) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Ki67-LI","ISUP","Ki67+ISUP","QuantGleason",
                                                "IQGleason","Ki67+IQGlea.")) ### order them here!

### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.2,0.3,0.5,0.7,0.9)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.9)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=alpha(c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),c(1,1,1,1,1,0.0000000001)))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")



ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

plotRisk(ROC.data,times=times)

# Distribuiton of the Data
barplot(table(C.data$ISUP))
plot(density(C.data$ISUP))


# Colors: palette=c("black","darkblue","royalblue3","forestgreen","mediumseagreen","orange","gold3","red","firebrick"),
# "blue","cyan3","#EFE814","#6B8DBD","#348F57","Firebrick"
#Order old built in RiskRegression plot "orange","darkblue","royalblue3","forestgreen","mediumseagreen","firebrick"
#Order ggplot: "firebrick","mediumseagreen","forestgreen","royalblue3","darkblue","orange"

################################################################
#### Lets double check the p-values with the old timROC
################################################################
### load required packages
library(dplyr)
library(magrittr)
library(timeROC)
library(survival)
library(tibble)
library(tidyr)
library(ggplot2)


### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### auto ki67-Li
c1<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI, C.data)
### ISUP
c2<-coxph(Surv(BCR.months,BCR.censor==0)~ISUP, C.data)
### Gelason
c3<-coxph(Surv(BCR.months,BCR.censor==0)~Gleason, C.data)
### QuantGleason
c4<-coxph(Surv(BCR.months,BCR.censor==0)~QuantGleason, C.data)
### IQ Gleason
c5<-coxph(Surv(BCR.months,BCR.censor==0)~IQGleason, C.data)
### auto-Ki67 + IQ Gleason
c6<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+IQGleason, C.data)
mdta<-model.frame(~BCR.months+BCR.censor+Ki67.LI+ISUP+Gleason+QuantGleason+IQGleason, C.data)

### calculate predictors
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred3~pred2,mdta)
plot(pred4~pred3,mdta)
plot(pred5~pred4,mdta)
plot(pred6~pred5,mdta)

### Multivariate regression analysis step by step
#times<-1:5*12
times<-5*12
#mdta_s<-mdta[sample(nrow(mdta),3000),]

ROC.B1<-mdta%$%timeROC(T=BCR.months, delta=1-BCR.censor, marker=pred1, weighting = "marginal",cause=1, iid=TRUE, times=times)
ROC.B2<-mdta%$%timeROC(T=BCR.months, delta=1-BCR.censor, marker=pred2, weighting = "marginal",cause=1, iid=TRUE, times=times)
ROC.B3<-mdta%$%timeROC(T=BCR.months, delta=1-BCR.censor, marker=pred3, weighting = "marginal",cause=1, iid=TRUE, times=times)
ROC.B4<-mdta%$%timeROC(T=BCR.months, delta=1-BCR.censor, marker=pred4, weighting = "marginal",cause=1, iid=TRUE, times=times)
ROC.B5<-mdta%$%timeROC(T=BCR.months, delta=1-BCR.censor, marker=pred5, weighting = "marginal",cause=1, iid=TRUE, times=times)
ROC.B6<-mdta%$%timeROC(T=BCR.months, delta=1-BCR.censor, marker=pred6, weighting = "marginal",cause=1, iid=TRUE, times=times)


compare(ROC.B6,ROC.B5,ROC.B4,ROC.B3,ROC.B2,ROC.B1)
plotAUCcurveDiff(ROC.B6,ROC.B5,ROC.B4,ROC.B3,ROC.B2,ROC.B1, conf.int = TRUE)

ROC<-bind_rows(inner_join(ROC.B6%$%as_tibble(FP)%>%mutate(index=row_number())%>%gather("time","FP",-index),
                          ROC.B6%$%as_tibble(TP)%>%mutate(index=row_number())%>%gather("time","TP",-index))%>%mutate(mode="B6"),
               inner_join(ROC.B5%$%as_tibble(FP)%>%mutate(index=row_number())%>%gather("time","FP",-index),
                          ROC.B5%$%as_tibble(TP)%>%mutate(index=row_number())%>%gather("time","TP",-index))%>%mutate(mode="B5"),
               inner_join(ROC.B4%$%as_tibble(FP)%>%mutate(index=row_number())%>%gather("time","FP",-index),
                          ROC.B4%$%as_tibble(TP)%>%mutate(index=row_number())%>%gather("time","TP",-index))%>%mutate(mode="B4"),
               inner_join(ROC.B3%$%as_tibble(FP)%>%mutate(index=row_number())%>%gather("time","FP",-index),
                          ROC.B3%$%as_tibble(TP)%>%mutate(index=row_number())%>%gather("time","TP",-index))%>%mutate(mode="B3"),
               inner_join(ROC.B2%$%as_tibble(FP)%>%mutate(index=row_number())%>%gather("time","FP",-index),
                          ROC.B2%$%as_tibble(TP)%>%mutate(index=row_number())%>%gather("time","TP",-index))%>%mutate(mode="B2"),
               inner_join(ROC.B1%$%as_tibble(FP)%>%mutate(index=row_number())%>%gather("time","FP",-index),
                          ROC.B1%$%as_tibble(TP)%>%mutate(index=row_number())%>%gather("time","TP",-index))%>%mutate(mode="B1"))

auc<-left_join(bind_rows(enframe(ROC.B6$AUC,name="time", value="AUC")%>%mutate(mode="B6"),
                         enframe(ROC.B5$AUC,name="time", value="AUC")%>%mutate(mode="B5"),
                         enframe(ROC.B4$AUC,name="time", value="AUC")%>%mutate(mode="B4"),
                         enframe(ROC.B3$AUC,name="time", value="AUC")%>%mutate(mode="B3"),
                         enframe(ROC.B2$AUC,name="time", value="AUC")%>%mutate(mode="B2"),
                         enframe(ROC.B1$AUC,name="time", value="AUC")%>%mutate(mode="B1")),
               bind_rows(as_tibble(confint(ROC.B1)$CI_AUC/100, rownames="time")%>%mutate(mode="B1"),
                         as_tibble(confint(ROC.B2)$CI_AUC/100, rownames="time")%>%mutate(mode="B2"),
                         as_tibble(confint(ROC.B3)$CI_AUC/100, rownames="time")%>%mutate(mode="B3"),
                         as_tibble(confint(ROC.B4)$CI_AUC/100, rownames="time")%>%mutate(mode="B4"),
                         as_tibble(confint(ROC.B5)$CI_AUC/100, rownames="time")%>%mutate(mode="B5"),
                         as_tibble(confint(ROC.B6)$CI_AUC/100, rownames="time")%>%mutate(mode="B6")))
auc %<>% mutate(label=sprintf("AUC %s: %.3f (%.3f, %.3f)",mode,AUC,`2.5%`, `97.5%`),y=.15-.05*(mode=="B1"))

pval1<- enframe(compare(ROC.B1,ROC.B2)$p_values_AUC,name="time", value="pvalue")%>%mutate(label=sprintf("p-value %.3f", pvalue), y=.05)
pval2<- enframe(compare(ROC.B2,ROC.B3)$p_values_AUC,name="time", value="pvalue")%>%mutate(label=sprintf("p-value %.3f", pvalue), y=.05)
pval3<- enframe(compare(ROC.B3,ROC.B4)$p_values_AUC,name="time", value="pvalue")%>%mutate(label=sprintf("p-value %.3f", pvalue), y=.05)
pval4<- enframe(compare(ROC.B4,ROC.B5)$p_values_AUC,name="time", value="pvalue")%>%mutate(label=sprintf("p-value %.3f", pvalue), y=.05)
pval5<- enframe(compare(ROC.B5,ROC.B6)$p_values_AUC,name="time", value="pvalue")%>%mutate(label=sprintf("p-value %.3f", pvalue), y=.05)


rocp<-ROC%>%filter(time%in%paste0("t=",c(5)*12))
aucp<-auc%>%filter(time%in%paste0("t=",c(5)*12))
pvalp1<-pval1%>%filter(time%in%paste0("t=",c(5)*12))

### position for mulitple AUC labels
z<-c(0.1,0.15,0.2,0.25,0.3,0.35)

### finally plot all data with ggplot
ggplot(rocp,aes(x=FP,y=TP, colour=mode, group=mode))+geom_line(size=1)+facet_wrap(~time)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+labs(colour="HeteroProg", x="False positive rate (1-Specifity)", y="True positive rate (Sensitivity)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  geom_text(data=aucp,aes(label=label,x=0.95,y=z),size=8/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=pvalp1,aes(label=label,x=0.95,y=y, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")

# geom_line(size=1) = Strichdicke der ROC-Curves


colSums(!is.na(B.data.random))
colSums(!is.na(C.data))
pval1
pval2 
pval3
pval4
pval5


#ISUP aus RPE gleason
######################################                                                                     #########################################################################
######################################    HeteroProg (ki67+IQ, Quant Gleason, Gleason, ISUP)  #########################################################################
######################################                                                                    #########################################################################
# Ki67 Mean of 6  vs. GS::SUM_Gleason_Berechnet --> ISUP vs. ISUP + auto-Ki67-LI
# Package riskRegression (12.2020) (https://github.com/tagteam/riskRegression)

###Load raw data  
setwd("/Users/niclasblessin/MS08_Prostate ki67 score/") 


###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("Blessin_autoKi67LI_per patient data hetero.xlsx", sheet="Blessin_autoKi67LI_hetero", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]
#B.data.nomogram=read.xlsx("Prostate TMA 2nd round 39 slides new2 FINAL.xlsx", sheet="Nomogram", colNames = TRUE)

### Include patients like the JMP-Datafilter
B.data <- A.data.patients [A.data.patients$Filter.final == "include",]
B.data <- B.data[B.data$RPE_TMA == "Pro 19",] 

### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)


#### pickup data needed
C.data <-B.data%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"="Mean.of.6.(23um)",
                          "Gleason"="RPE_Gleason",
                          "QuantGleason"="RPE_quantGleason",
                          "IQGleason"="RPE_IQ_Gleason")

### understanding the data
barplot(table(B.data$RPE_Gleason))
barplot(table(B.data$RPE_quantGleason))

### Change to ISUP
C.data<-C.data%>% mutate(ISUP=recode(Gleason,"3+3"="1",
                                     "3+4"="2",
                                     "3+4 Tert.5"="2",
                                     "3+5"="4",
                                     "4+3"="3",
                                     "4+3 Tert.5"="3",
                                     "4+4"="4",
                                     "4+5"="5",
                                     "5+3"="4",
                                     "5+4"="5",
                                     "5+5"="5"))


### understanding the data
barplot(table(C.data$ISUP))
barplot(table(C.data$Gleason))
barplot(table(C.data$QuantGleason))
barplot(table(C.data$IQGleason))

### Use always numeric values if possible! 
C.data$ISUP <- as.numeric(C.data$ISUP)
C.data$IQGleason <- as.numeric(C.data$IQGleason)

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### auto ki67-Li
c1<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI, C.data)
### ISUP
c2<-coxph(Surv(BCR.months,BCR.censor==0)~ISUP, C.data)
### Ki67+Gleason
c3<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+ISUP, C.data)
### QuantGleason
c4<-coxph(Surv(BCR.months,BCR.censor==0)~QuantGleason, C.data)
### IQ Gleason
c5<-coxph(Surv(BCR.months,BCR.censor==0)~IQGleason, C.data)
### auto-Ki67 + IQ Gleason
c6<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+IQGleason, C.data)
mdta<-model.frame(~BCR.months+BCR.censor+Ki67.LI+ISUP+Gleason+QuantGleason+IQGleason, C.data)


### calculate predictors
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred3~pred2,mdta)
plot(pred4~pred3,mdta)
plot(pred5~pred4,mdta)
plot(pred6~pred5,mdta)

### Multivariate regression analysis step by step
times<-5*12

ROC.data <- Score(list("Ki67-LI"=mdta$pred1,"ISUP"=mdta$pred2,
                       "Ki67+ISUP"=mdta$pred3,"QuantGleason"=mdta$pred4,
                       "IQGleason"=mdta$pred5,"Ki67+IQGlea."=mdta$pred6),
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
p.RR<- p.RR[c(1,6,10,13,15)] #### select the p-values from the list we need
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$y<-c(0.1,0.15,0.2,0.25,0.3,0.35) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Ki67-LI","ISUP","Ki67+ISUP","QuantGleason",
                                                "IQGleason","Ki67+IQGlea.")) ### order them here!

### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.2,0.3,0.5,0.7,0.9)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.9)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=c("orange","royalblue3","red","turquoise","forestgreen","firebrick"))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")



ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

plotRisk(ROC.data,times=times)

# Distribuiton of the Data
barplot(table(C.data$ISUP))
plot(density(C.data$ISUP))














# ISUP aus MAx
######################################                                                                     #########################################################################
######################################    ISUP aus max HeteroProg (ki67+IQ, Quant Gleason, , ISUP)  #########################################################################
######################################                                                                    #########################################################################
# Ki67 Mean of 6  vs. GS::SUM_Gleason_Berechnet --> ISUP vs. ISUP + auto-Ki67-LI
# Package riskRegression (12.2020) (https://github.com/tagteam/riskRegression)

###Load raw data  
setwd("/Users/niclasblessin/MS08_Prostate ki67 score/") 


###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("Blessin_autoKi67LI_per patient data hetero.xlsx", sheet="Blessin_autoKi67LI_hetero", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]
#B.data.nomogram=read.xlsx("Prostate TMA 2nd round 39 slides new2 FINAL.xlsx", sheet="Nomogram", colNames = TRUE)

### Include patients like the JMP-Datafilter
B.data <- A.data.patients [A.data.patients$Filter.final == "include",]
B.data <- B.data[B.data$RPE_TMA == "Pro 19",] 

### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)

#### pickup data needed
C.data <-B.data%>% select("BCR.censor"="BCR.censor","BCR.months"="BCR.months",
                          "Ki67.LI"="Mean.of.6.(23um)",
                          "Gleason"="GS::MAX_Gleason",
                          "QuantGleason"="GS::SUM_quant_Gleason",
                          "IQGleason"="GS::SUM_IQ_Gleason")

### understanding the data
barplot(table(B.data$`GS::MAX_Gleason`))
barplot(table(B.data$`GS::SUM_IQ_Gleason`))

### Change to ISUP
C.data<-C.data%>% mutate(ISUP=recode(Gleason,"3+3"="1",
                                     "3+4"="2",
                                     "3+5"="4",
                                     "4+3"="3",
                                     "4+4"="4",
                                     "4+5"="5",
                                     "5+3"="4",
                                     "5+4"="5",
                                     "5+5"="5"))


### understanding the data
barplot(table(C.data$ISUP))
barplot(table(C.data$Gleason))
barplot(table(C.data$QuantGleason))
barplot(table(C.data$IQGleason))

### Use always numeric values if possible! 
C.data$ISUP <- as.numeric(C.data$ISUP)
C.data$IQGleason <- as.numeric(C.data$IQGleason)

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### auto ki67-Li
c1<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI, C.data)
### ISUP
c2<-coxph(Surv(BCR.months,BCR.censor==0)~ISUP, C.data)
### Ki67+Gleason
c3<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+ISUP, C.data)
### QuantGleason
c4<-coxph(Surv(BCR.months,BCR.censor==0)~QuantGleason, C.data)
### IQ Gleason
c5<-coxph(Surv(BCR.months,BCR.censor==0)~IQGleason, C.data)
### auto-Ki67 + IQ Gleason
c6<-coxph(Surv(BCR.months,BCR.censor==0)~Ki67.LI+IQGleason, C.data)
mdta<-model.frame(~BCR.months+BCR.censor+Ki67.LI+ISUP+Gleason+QuantGleason+IQGleason, C.data)


### calculate predictors
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred3~pred2,mdta)
plot(pred4~pred3,mdta)
plot(pred5~pred4,mdta)
plot(pred6~pred5,mdta)

### Multivariate regression analysis step by step
times<-5*12

ROC.data <- Score(list("Ki67-LI"=mdta$pred1,"ISUP"=mdta$pred2,
                       "Ki67+ISUP"=mdta$pred3,"QuantGleason"=mdta$pred4,
                       "IQGleason"=mdta$pred5,"Ki67+IQGlea."=mdta$pred6),
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
ROC.riskRegression<-ROC.riskRegression %>% add_row(model = unique(ROC.riskRegression$model),times=times,TPR=0,FPR=0)   ### create a start point for ROC
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
p.RR<- p.RR[c(1,6,10,13,15)] #### select the p-values from the list we need
#AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",sprintf("%.2f", round(AUC,2))," ","(",sprintf("%.2f", round(lower,2)),"-",sprintf("%.2f", round(upper,2)),")")
AUC.RR$y<-c(0.1,0.15,0.2,0.25,0.3,0.35) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Ki67-LI","ISUP","Ki67+ISUP","QuantGleason",
                                                "IQGleason","Ki67+IQGlea.")) ### order them here!

### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.2,0.3,0.5,0.7,0.9)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.9)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=alpha(c("orange","royalblue3","red","turquoise","forestgreen","firebrick"),c(1,1,1,1,1,1)))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")



ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

plotRisk(ROC.data,times=times)

# Distribuiton of the Data
barplot(table(C.data$ISUP))
plot(density(C.data$ISUP))


# Colors: palette=c("black","darkblue","royalblue3","forestgreen","mediumseagreen","orange","gold3","red","firebrick"),
# "blue","cyan3","#EFE814","#6B8DBD","#348F57","Firebrick"
#Order old built in RiskRegression plot "orange","darkblue","royalblue3","forestgreen","mediumseagreen","firebrick"
#Order ggplot: "firebrick","mediumseagreen","forestgreen","royalblue3","darkblue","orange"


############################################################################################################################################################################
############################################################################################################################################################################
############################################################################################################################################################################
######################################################## Figure 4 Heterogenious prognosis Message 6 better than 1 biopsy  ######################################################################
############################################################################################################################################################################
############################################################################################################################################################################
############################################################################################################################################################################

###Load raw data  
setwd("/Users/niclasblessin/MS08_Prostate ki67 score/") 


###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("Blessin_autoKi67LI_per patient data hetero.xlsx", sheet="Blessin_autoKi67LI_hetero", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]
#B.data.nomogram=read.xlsx("Prostate TMA 2nd round 39 slides new2 FINAL.xlsx", sheet="Nomogram", colNames = TRUE)

### Include patients like the JMP-Datafilter
#B.data <- A.data.patients[A.data.patients$Filter.final == "include",]
#B.data <- B.data[B.data$RPE_TMA == "Pro 19",] 


### calculations for the Paper 
library(dplyr)
D.data <-B.data%>% select("TMA1place"="RPE_Punch1_TMAlocalisation","TMA2place"="RPE_Punch2_TMAlocalisation","TMA3place"="RPE_Punch3_TMAlocalisation",
                          "TMA4place"="RPE_Punch4_TMAlocalisation","TMA5place"="RPE_Punch5_TMAlocalisation","TMA6place"="RPE_Punch6_TMAlocalisation",
                          "Foci1.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy1","Foci2.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy2",
                          "Foci3.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy3","Foci4.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy4",
                          "Foci5.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy5","Foci6.23um"="Ki67.Li.(after.23um.exclusion).of.Biopsy6",
                          "Ki67ofMAXGleason"="Ki67-LI.of.spot.with.max.Gleason")

## finding out the number of cores with 6
D.data <- B.data[,3:8]
any(is.na(D.data))
sum(is.na(D.data))
D.data <- D.data[complete.cases(D.data),]
any(is.na(D.data))
summary(D.data)

colSums(!is.na(D.data))

######################################                                                                     #########################################################################
######################################    HeteroProg Mean Roc-Curve for Heteroprognosis analysis un-randomizded (Sample 1-6 improve a lot)  #########################################################################
######################################                                                                    #########################################################################
# Mean of 1-6 samples vs. each other
# Package riskRegression (12.2020) (https://github.com/tagteam/riskRegression)

### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)

#Datainfo
summary(B.data)
colSums(!is.na(B.data))
barplot(table(B.data$`Ki67.Li.(after.23um.exclusion).of.Biopsy1`))


#####23um

#### pickup data needed
C.data<-B.data[c("BCR.censor","BCR.months")]
C.data$MeanBiopsy1<-B.data$`Ki67.Li.(after.23um.exclusion).of.Biopsy1`
C.data$MeanBiopsy2<-B.data$`Mean.of.2.(23um)`
C.data$MeanBiopsy3<-B.data$`Mean.of.3.(23um)`
C.data$MeanBiopsy4<-B.data$`Mean.of.4.(23um)`
C.data$MeanBiopsy5<-B.data$`Mean.of.5.(23um)`
C.data$MeanBiopsy6<-B.data$`Mean.of.6.(23um)`
#C.data<-A.data.patients%>%select(PatID,"Ki67.score","BCR.censor","BCR.months", Biobsy.Gleason=Bx.Gleason.max.group, Clinical.stage=Bx.klin.Stadium,  Presurgery.PSA=PSA.preop)

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### old
c1<-coxph(Surv(BCR.months,BCR.censor==0)~MeanBiopsy1, C.data)
### new
c2<-coxph(Surv(BCR.months,BCR.censor==0)~MeanBiopsy2, C.data)
### newest
c3<-coxph(Surv(BCR.months,BCR.censor==0)~MeanBiopsy3, C.data)
c4<-coxph(Surv(BCR.months,BCR.censor==0)~MeanBiopsy4, C.data)
c5<-coxph(Surv(BCR.months,BCR.censor==0)~MeanBiopsy5, C.data)
c6<-coxph(Surv(BCR.months,BCR.censor==0)~MeanBiopsy6, C.data)

mdta<-model.frame(~BCR.months+BCR.censor+MeanBiopsy1+MeanBiopsy2+MeanBiopsy3+MeanBiopsy4+MeanBiopsy5+MeanBiopsy6, C.data)

### calculate pred 1 & 2
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred4~pred3,mdta)
plot(pred6~pred5,mdta)

### Multivariate regression analysis step by step
#times<-1:5*12
times<-5*12
#mdta_s<-mdta[sample(nrow(mdta),3000),]

ROC.data <- Score(list("Mean of 1"=mdta$pred1,"Mean of 2"=mdta$pred2,"Mean of 3"=mdta$pred3,"Mean of 4"=mdta$pred4,
                       "Mean of 5"=mdta$pred5,"Mean of 6"=mdta$pred6),
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("black","firebrick","red","firebrick1","orange","gold"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
#p.RR<- p.RR[c(1,6,10,13,15)] 
p.RR<- p.RR[c(15,13,10,6,1)] 
#### select the p-values from the list we need
#AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",sprintf("%.2f", round(AUC,2))," ","(",round(lower,2),"-",round(upper,2),")")

AUC.RR$y<-c(0.1,0.15,0.2,0.25,0.3,0.35) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Mean of 1","Mean of 2","Mean of 3","Mean of 4",
                                                "Mean of 5","Mean of 6")) ### order them here!

### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.2,0.3,0.5,0.7,0.9)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.75)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=alpha(c("gold2","orange","#E18727FF","#FF0000","firebrick","#750000"),c(1,1,1,1,1,1)))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward")+ 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")


ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

plotRisk(ROC.data,times=times)

# Distribuiton of the Data
barplot(table(C.data$xx))
plot(density(C.data$xxx))

#scale_colour_manual(values=c("gold","orange","firebrick1","red","firebrick","black"))
#col =c("firebrick","forestgreen","turquoise","royalblue3","black","orange"),
# Colors: palette=c("black","darkblue","royalblue3","forestgreen","mediumseagreen","orange","gold3","red","firebrick"),
# "blue","cyan3","#EFE814","#6B8DBD","#348F57","Firebrick"
#Order old built in RiskRegression plot "orange","darkblue","royalblue3","forestgreen","mediumseagreen","firebrick"
#Order ggplot: "firebrick","mediumseagreen","forestgreen","royalblue3","darkblue","orange"



######################################                                                                     #########################################################################
######################################    HeteroProg Mean of 6 vs. MAX of 6 vs. 1 with worst Gleason  #########################################################################
######################################                                                                    #########################################################################
# mean vs. max. vs. worst gleason

#Datainfo
summary(B.data)
colSums(!is.na(B.data))

#### pickup data needed
C.data<-B.data[c("BCR.censor","BCR.months")]
C.data$Max <-B.data$`Ki67-LI.of.spot.with.max.Gleason`
C.data$Mean <-B.data$`Mean.of.6.(23um)`
C.data$Max6 <-B.data$`Max.of.6.(23um)`
C.data <- C.data[C.data$Max>0.0001,] 


### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)

#Datainfo
summary(B.data)
colSums(!is.na(C.data))
barplot(table(C.data$Mean))

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### old
c1<-coxph(Surv(BCR.months,BCR.censor==0)~Max, C.data)
### new
c2<-coxph(Surv(BCR.months,BCR.censor==0)~Mean, C.data)
### newest
c3<-coxph(Surv(BCR.months,BCR.censor==0)~Max6, C.data)

mdta<-model.frame(~BCR.months+BCR.censor+Mean+Max+Max6, C.data)

### calculate pred 1 & 2
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
plot(pred2~pred1,mdta)

### Multivariate regression analysis step by step
#times<-1:5*12
times<-5*12
#mdta_s<-mdta[sample(nrow(mdta),3000),]

ROC.data <- Score(list("Ki67-LI of 1 (worst Gleason)"=mdta$pred1,"Max of 6"=mdta$pred3,
                       "Mean of 6"=mdta$pred2),  ### change names only here
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("orange","red","firebrick"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
#p.RR<- p.RR[c(1,6,10,13,15)] 
p.RR<- p.RR[c(1,3)] 
#### select the p-values from the list we need
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$y<-c(0.1,0.15,0.2) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Mean of 6","Max of 6","Ki67-LI of 1 (worst Gleason)")) ### order them here!

### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.9,0.7)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.75)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=alpha(c("firebrick","red","orange"),c(1,1,1)))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")


ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)
colSums(!is.na(C.data))


plotRisk(ROC.data,times=times)

# Distribuiton of the Data
barplot(table(C.data$xx))
plot(density(C.data$xxx))

#col =c("firebrick","forestgreen","turquoise","royalblue3","black","orange"),
# Colors: palette=c("black","darkblue","royalblue3","forestgreen","mediumseagreen","orange","gold3","red","firebrick"),
# "blue","cyan3","#EFE814","#6B8DBD","#348F57","Firebrick"
#Order old built in RiskRegression plot "orange","darkblue","royalblue3","forestgreen","mediumseagreen","firebrick"
#Order ggplot: "firebrick","mediumseagreen","forestgreen","royalblue3","darkblue","orange"

######################################                                                                     #########################################################################
######################################                            MAX of 1-6                 #########################################################################
######################################                                                                    #########################################################################
# mean vs. max. vs. worst gleason



###Load raw data  
setwd("/Users/niclasblessin/MS08_Prostate ki67 score/") 


###Import and select Data
detach("package:xlsx")
library("openxlsx")
Header = TRUE 
A.data.patients=read.xlsx("Blessin_autoKi67LI_per patient data hetero.xlsx", sheet="Blessin_autoKi67LI_hetero", colNames = TRUE)
#A.data.patients <- A.data.patients[1:800,]
#B.data.nomogram=read.xlsx("Prostate TMA 2nd round 39 slides new2 FINAL.xlsx", sheet="Nomogram", colNames = TRUE)

### Include patients like the JMP-Datafilter
B.data <- A.data.patients[A.data.patients$Filter.final == "include",]
B.data <- B.data[B.data$RPE_TMA == "Pro 19",] 

######################################                                                                     #########################################################################
######################################    HeteroProg Mean Roc-Curve for Heteroprognosis analysis un-randomizded (Sample 1-6 improve a lot)  #########################################################################
######################################                                                                    #########################################################################
# Mean of 1-6 samples vs. each other
# Package riskRegression (12.2020) (https://github.com/tagteam/riskRegression)

### select data and load required packages
library(dplyr)
library(riskRegression)
library(survival)

library(tibble)
library(magrittr)
library(tidyr)
library(ggplot2)

#Datainfo
summary(B.data)
colSums(!is.na(B.data))
barplot(table(B.data$`Ki67.Li.(after.23um.exclusion).of.Biopsy1`))


#####23um

#### pickup data needed
C.data<-B.data[c("BCR.censor","BCR.months")]
C.data$MaxBiopsy1<-B.data$`Ki67.Li.(after.23um.exclusion).of.Biopsy1`
C.data$MaxBiopsy2<-B.data$`Max.of.2.(23um)`
C.data$MaxBiopsy3<-B.data$`Max.of.3.(23um)`
C.data$MaxBiopsy4<-B.data$`Max.of.4.(23um)`
C.data$MaxBiopsy5<-B.data$`Max.of.5.(23um)`
C.data$MaxBiopsy6<-B.data$`Max.of.6.(23um)`
#C.data<-A.data.patients%>%select(PatID,"Ki67.score","BCR.censor","BCR.months", Biobsy.Gleason=Bx.Gleason.max.group, Clinical.stage=Bx.klin.Stadium,  Presurgery.PSA=PSA.preop)

### is.na
any(is.na(C.data))
sum(is.na(C.data))
C.data <- C.data[complete.cases(C.data),]
any(is.na(C.data))
summary(C.data)

### pick the variables like in the multivariate analysis (independ?) for testing them for their predictive power
### old
c1<-coxph(Surv(BCR.months,BCR.censor==0)~MaxBiopsy1, C.data)
### new
c2<-coxph(Surv(BCR.months,BCR.censor==0)~MaxBiopsy2, C.data)
### newest
c3<-coxph(Surv(BCR.months,BCR.censor==0)~MaxBiopsy3, C.data)
c4<-coxph(Surv(BCR.months,BCR.censor==0)~MaxBiopsy4, C.data)
c5<-coxph(Surv(BCR.months,BCR.censor==0)~MaxBiopsy5, C.data)
c6<-coxph(Surv(BCR.months,BCR.censor==0)~MaxBiopsy6, C.data)

mdta<-model.frame(~BCR.months+BCR.censor+MaxBiopsy1+MaxBiopsy2+MaxBiopsy3+MaxBiopsy4+MaxBiopsy5+MaxBiopsy6, C.data)

### calculate pred 1 & 2
mdta$pred1<-predict(c1,type="lp")
mdta$pred2<-predict(c2,type="lp")
mdta$pred3<-predict(c3,type="lp")
mdta$pred4<-predict(c4,type="lp")
mdta$pred5<-predict(c5,type="lp")
mdta$pred6<-predict(c6,type="lp")
plot(pred2~pred1,mdta)
plot(pred4~pred3,mdta)
plot(pred6~pred5,mdta)

### Multivariate regression analysis step by step
#times<-1:5*12
times<-5*12
#mdta_s<-mdta[sample(nrow(mdta),3000),]

ROC.data <- Score(list("Max of 1"=mdta$pred1,"Max of 2"=mdta$pred2,"Max of 3"=mdta$pred3,"Max of 4"=mdta$pred4,
                       "Max of 5"=mdta$pred5,"Max of 6"=mdta$pred6),
                  Surv(BCR.months,BCR.censor==0)~1,data=mdta,
                  times=times,plots="roc",metrics="auc",summary='risks')

### plot the time-dependent receiver operating characteristic curves
plotROC(ROC.data,times=times,
        xlab = "1-Specifity (False-Postive rate)",
        ylab= "Sensitivity (True-Positive rate)",
        col =c("black","firebrick","red","firebrick1","orange","gold"),
        lwd=2, lty=1, cex=1,pch=1, 
        legend=TRUE,auc.in.legend =TRUE,brier.in.legend=FALSE,
        add = FALSE)
### close the box & make the diagonal line less obvious
lines(x = c(-0.1,1.5), y = c(-0.1,1.5), col="white",lwd=3.5)
lines(x = c(-0.05,1.05), y = c(-0.05,1.05), col="lightgrey",lwd=1,lty="dotted")
box(which = "plot", lty = "solid", col="black")

### p-value for AUC comparison
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

### ggplot 2 preperation 
ROC.riskRegression<- ROC.data%$%ROC%$%plotframe
AUC.RR<- ROC.data%$%AUC%$%score
p.RR<- ROC.data%$%AUC%$%contrasts
#p.RR<- p.RR[c(1,6,10,13,15)] 
p.RR<- p.RR[c(15,13,10,6,1)] 
#### select the p-values from the list we need
AUC.RR$label<-AUC.RR %$% paste0(model,": "," AUC","=",round(AUC,2)," ","(",round(lower,2),"-",round(upper,2),")")
AUC.RR$y<-c(0.1,0.15,0.2,0.25,0.3,0.35) ### just add 0.25, 0.3 ... according to the number of additonal curves
p.RR$label<-p.RR %$% paste0("p","=",round(p,4))
AUC.RR$model <- factor(AUC.RR$model, levels = c("Max of 1","Max of 2","Max of 3","Max of 4",
                                                "Max of 5","Max of 6")) ### order them here!

### finally plot all data with ggplot
AUC.RR$model ### to change the names for the corresponding colors
z=c(0.2,0.3,0.5,0.7,0.9)
ggplot(ROC.riskRegression,aes(x=FPR,y=TPR, colour=model, group=model))+
  geom_line(size=0.75)+facet_wrap(~times)+theme_bw()+
  geom_abline(intercept = 0, slope=1, linetype=3, colour="gray")+
  labs(colour="Ki67-LI", x="1-Specifity (False-Postive rate)", y="Sensitivity (True-Positive rate)")+
  theme(panel.grid=element_blank(),strip.background = element_blank(), panel.spacing = unit(2, "lines"))+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  scale_colour_manual(values=c("gold","orange","firebrick1","red","firebrick","black"))+
  geom_text(data=AUC.RR,aes(label=label,x=.95,y=y),size=10/.pt, show.legend = FALSE,hjust="inward") + 
  geom_text(data=p.RR,aes(label=label,x=z,y=0.05, group=NULL, colour=NULL),size=8/.pt, show.legend = FALSE,hjust="inward")


ROC.data[["ROC"]]
print(ROC.data, times, digits=3, eps=10^-4,verbose=TRUE, conf.int=0.95)

plotRisk(ROC.data,times=times)

# Distribuiton of the Data
barplot(table(C.data$xx))
plot(density(C.data$xxx))

#col =c("firebrick","forestgreen","turquoise","royalblue3","black","orange"),
# Colors: palette=c("black","darkblue","royalblue3","forestgreen","mediumseagreen","orange","gold3","red","firebrick"),
# "blue","cyan3","#EFE814","#6B8DBD","#348F57","Firebrick"
#Order old built in RiskRegression plot "orange","darkblue","royalblue3","forestgreen","mediumseagreen","firebrick"
#Order ggplot: "firebrick","mediumseagreen","forestgreen","royalblue3","darkblue","orange"