ChemoResistantPredictionCIN / Analysis / Implementing_Platins_Response_Predictor / platins_classifier.R
platins_classifier.R
Raw
################################################################################
## This script is used to generalize the classifier for platins prediction to
## all different cohorts. This is crucial for moving the biomarker into the clinic
## Script adapted from Drews et al. Nature 2022
################################################################################

rm(list=ls(all=TRUE))

### LIBRARIES
suppressPackageStartupMessages({
    # Standard packages used throughout repo
    library(this.path)
    library(data.table)
    library(ggplot2)
    library(ggthemes)
    # # For survival analysis
    library(survival)
    library(survminer)
})

### PLOTTING THEMES
theme_set(theme_tufte(base_size = 6, base_family = "ArialMT"))
theme_update(text = element_text(size = 6),
             axis.text = element_text(size = 6),
             axis.line = element_line(size = 0.5),
             axis.ticks = element_line(size = 0.5),
             axis.ticks.length = unit(.1, "cm"))

### PATHS
PLATDIR=dirname(this.path())
BASE=dirname(dirname(PLATDIR))
INDIR=paste0(BASE,"/Input_Data")
FIGDIR=paste0(BASE,"/Figures")
dir.create(FIGDIR, showWarnings = FALSE, recursive = TRUE)

### LOAD DATA
## Data - TCGA
ACTIVITIES=file.path(INDIR, "Signature_Compendium_v5_Cosine-0.74_Activities_THRESH95_NAMESAPRIL21.rds")
# ACTIVITIES=file.path("C:/Users/bhernando/Desktop/CNIO/UsefulData/TCGA/Jul2024/Signature_Compendium_v5_Cosine-0.74_Activities_THRESH95_JULY2024.rds")
META=file.path(INDIR, "Metadata_TCGA_ASCAT_penalty70.rds")
BRCA.STATE=file.path(INDIR, "TCGA_BRCA_MutStatus.rds")
## From previous survival analysis (Compile_survival_data_onlyclin.R)
SURVIVALOV=file.path(INDIR, "Survival_and_BRCA_Status_TCGA-OV.rds")

LINK=file.path(INDIR,"TCGA_PCAWG_links_plusCancer.rds")

## Data - PCAWG
PCAWG=file.path(INDIR, "PCAWG_signature_activities_THRESH095_NAMESAPRIL21.rds")
# PCAWG=file.path("C:/Users/bhernando/Desktop/CNIO/UsefulData/PCAWG/Jul2024/4_Activities_CompendiumCINSigs_THRESH95_PCAWG_Jul24.rds")
METAPCAWG=file.path(INDIR, "PCAWG_1900_samples_CINSig_activities_metadata_plus_HRDetect.rds")
SURVPCAWG=file.path(INDIR, "pcawg_donor_clinical_August2016_v9.fixed.tsv")
TSPCAWG=file.path(INDIR, "pcawg_donor.tsv")

## Data - ICGC (560 Breast cancers)
ICGC560=file.path(INDIR, "ICGC_560BCs_signature_activities_THRESH095_NAMESAPRIL21.rds")
METAICGC560=file.path(INDIR, "Meta_560BreastCancers.tsv")

## Other essential variables
AGEBRACKETS=c(0, 53, 62, 76, 100)

## Load essential files
links = readRDS(LINK)


### FUNCTIONS
## Little function for applying the classifier to a data frame with columns "CX3", "CX2" and "Name"
applyClinClass = function(dtSurvOV, lModel) {
    
    ## Scale the activities
    mTCGAOV = as.matrix(dtSurvOV[ , c("CX3", "CX2")])
    rownames(mTCGAOV) = dtSurvOV$Name
    smTCGAOV = sweep(sweep(mTCGAOV, 2, lModel$mean, FUN = '-'), 2, lModel$scale, FUN = "/")
    
    dtSurvOV$sCX3 = smTCGAOV[,"CX3"]
    dtSurvOV$sCX2 = smTCGAOV[,"CX2"]
    
    ## Apply classifier
    if(identical(dtSurvOV$Name, rownames(smTCGAOV))) {
        dtSurvOV$Classifier = ifelse(dtSurvOV$sCX3 >= dtSurvOV$sCX2, "Predicted sensitive", "Predicted resistant")  
        dtSurvOV$Classifier = factor(dtSurvOV$Classifier, levels = c("Predicted sensitive","Predicted resistant"))
    } else {
        stop("Something went wrong.")
    }
    
    return(dtSurvOV)
    
}

## Bread and butter function of this markdown
plotKMandCoxOS = function(dtSurvOV, COHORTNAME = "TCGA-OV", CLASSIFIERNAME = "CX3/CX2 classifier", COXMODEL = "Full") {
    
    ## Argument: CLASSIFIER = "Classifier"
    
    ### Cannot get the automatic handling of variable CLASSIFIER from within the function. It is not recognised within the function despite being defined.
    # kmOV = survfit(Surv(OS.time, OS) ~ Classifier, data = dtSurvOV)
    # CLASSIFIER = noquote(COLNAME)
    # print(CLASSIFIER)
    # print(exists(CLASSIFIER))
    # kmOV = survfit(Surv(OS.time, OS) ~ get(CLASSIFIER), data = dtSurvOV)
    kmOV = survfit(reformulate(termlabels = CLASSIFIER, response = 'Surv(OS.time, OS)'), data = dtSurvOV)
    # kmOV = survfit(as.formula(paste("Surv(OS.time, OS)~", paste(TEST))), data = dtSurvOV)
    pKMOV = ggsurvplot(kmOV, data = dtSurvOV, risk.table = TRUE, pval = TRUE, conf.int = FALSE,
                       xlim = c(0,4000), break.time.by = 365, 
                       ggtheme = theme_tufte(base_size = 6, base_family = "ArialMT"),
                       surv.median.line = "hv", risk.table.y.text.col = TRUE, risk.table.y.text = FALSE) + 
        ggtitle(paste("OS in", COHORTNAME, "patients by", CLASSIFIERNAME))
    
    # Cox model
    # resCoxOV = coxph(Surv(OS.time, OS) ~ Classifier + AgeCategory + TS, data =  dtSurvOV)
    if(COXMODEL == "Full") {
        resCoxOV = coxph(reformulate(termlabels = paste(CLASSIFIER, "AgeCategory", "TS", sep = "+"), response = 'Surv(OS.time, OS)'), data = dtSurvOV)
    } else {
        resCoxOV = coxph(reformulate(termlabels = paste(CLASSIFIER, "AgeCategory", sep = "+"), response = 'Surv(OS.time, OS)'), data = dtSurvOV)
    }
    # Don't attempt to plot if Cox model failed
    nFailedCox = sum(is.na(resCoxOV$coefficients))
    if(nFailedCox != length(resCoxOV$coefficients)) {
        pCoxOV = ggforest(resCoxOV, data = dtSurvOV, main = paste("Cox HR model for", COHORTNAME))  
    } else {
        # Just an empty plot to not break the plotting later on.
        pCoxOV = ggplot() + theme_void() + xlab(NULL) +
            geom_text(aes(0,0,label = paste("Cox HR model for", COHORTNAME, "\nEmpty because Cox model failed to converge.")))
    }
    
    lOut = list(km = kmOV, kmPlot = pKMOV, cox = resCoxOV, coxPlot = pCoxOV)
    return(lOut)
}

### MODEL FOR SCALING ACTIVITIES BASED ON PAN-CANCER BRCAmut
## Get all BRCA1/2mut TCGA samples
brcaMuts = readRDS(BRCA.STATE)
brca1 = unique(as.character(brcaMuts$Sample[ grepl("[germline|somatic] BRCA1", 
                                                   brcaMuts$Status) ]))
brca2 = unique(as.character(brcaMuts$Sample[ grepl("[germline|somatic] BRCA2", 
                                                   brcaMuts$Status) ]))
# rad51c = unique(as.character(brcaMuts$Sample[ grepl("RAD51C", brcaMuts$Status) ]))
brcaMuts = unique(c(brca1,brca2))

# Model for scaling
mBRCA = readRDS(ACTIVITIES)
mBRCA = mBRCA[row.names(mBRCA)%in%brcaMuts,]
mBRCA = as.matrix(mBRCA[ , c("CX3", "CX2")])

## Model for scaling
lModel = list(mean = attributes(scale(mBRCA))$`scaled:center`, 
              scale = attributes(scale(mBRCA))$`scaled:scale`)


################################################################################

#### TCGA-OV ####
## Apply classifier to all TCGA-OV samples
dtSurvOV = readRDS(SURVIVALOV)

## Apply classifier to all TCGA-OV samples
dtSurvOV = applyClinClass(dtSurvOV, lModel)

## Age categorisation chosen from age histogram so that it works across all cohorts equally
dtSurvOV$AgeCategory = cut(dtSurvOV$AgeY, breaks = AGEBRACKETS, labels = 1:(length(AGEBRACKETS)-1))

## Survival analysis
CLASSIFIER = "Classifier"
lTCGAOVCX = plotKMandCoxOS(dtSurvOV, COHORTNAME = "TCGA-OV", CLASSIFIERNAME = "CX3/CX2 classifier")
summary(lTCGAOVCX$cox)
# HR=0.7248 & p-val=0.0067


#### PCAWG-OV ####
## Prepare signature activities
mPCAWG = readRDS(PCAWG)

## Add survival
dtSurv = fread(SURVPCAWG)
metaPCAWG = readRDS(METAPCAWG)
dtPcawg = metaPCAWG[, c("projectcode", "cancer_type", "samplename", "icgc_donor_id", "icgc_sample_id",
                        "donor_age_at_diagnosis", "tumour_type", "histology_tier4", "HRDetect")]

## Check that age at diagnosis is identical in both - just sanity check
dtPcawg$comp = dtSurv$donor_age_at_diagnosis[ match(dtPcawg$icgc_donor_id, dtSurv$icgc_donor_id) ]
summary(dtPcawg$comp - dtPcawg$donor_age_at_diagnosis)
dtPcawg$comp = NULL

dtPcawg$vital_status = dtSurv$donor_vital_status[ match(dtPcawg$icgc_donor_id, 
                                                        dtSurv$icgc_donor_id) ]
dtPcawg$survival_time = dtSurv$donor_survival_time[ match(dtPcawg$icgc_donor_id, 
                                                          dtSurv$icgc_donor_id) ]
dtPcawg$last_follow_up = dtSurv$donor_interval_of_last_followup[ match(dtPcawg$icgc_donor_id, 
                                                                       dtSurv$icgc_donor_id) ]

## Combine survival_time and last_follow_up into one OS.time column
dtPcawg$OS.time = dtPcawg$survival_time
dtPcawg$OS.time[ is.na(dtPcawg$OS.time) ] = dtPcawg$last_follow_up[ is.na(dtPcawg$OS.time) ]


## Create data table with all information
dtPcawg$CX3 = mPCAWG[ match(dtPcawg$samplename, rownames(mPCAWG)), "CX3"]
dtPcawg$CX2 = mPCAWG[ match(dtPcawg$samplename, rownames(mPCAWG)), "CX2"]

## Convert survival
dtPcawg$OS = ifelse(dtPcawg$vital_status == "alive", 0, 1)
dtPcawg$OS[ dtPcawg$vital_status == "" ] = NA

## Add staging
## PCAWG
dtTS = fread(TSPCAWG)
dtPcawg$TS = dtTS$donor_tumour_stage_at_diagnosis[ match(dtPcawg$icgc_donor_id, dtTS$icgc_donor_id) ]

## TCGA
dtPcawg$TCGA = links$TCGA[ match(dtPcawg$icgc_donor_id, links$ICGC) ]

## Stratify by age
## Look at histogram of age and you see three dips, neatly dividing the cohort into four age groups.
dtPcawg$AgeCategory = cut(dtPcawg$donor_age_at_diagnosis, breaks = AGEBRACKETS, labels = 1:(length(AGEBRACKETS)-1))

#### PCAWG-OV-AU
## Use only PCAWG-AU as PCAWG-US is the TCGA-OV cohort

## CX classifier => scaling within the cohort
mPCAWG.OV = as.matrix(dtPcawg[ dtPcawg$projectcode == "OV-AU", c("CX3", "CX2")])

## Scaling
dtOV = applyClinClass(dtPcawg[ dtPcawg$projectcode == "OV-AU", ], lModel)
dtOV$TS[ dtOV$TS == "" ] = NA

## Survival analysis
CLASSIFIER = "Classifier"
lPCAWGAUCX = plotKMandCoxOS(dtOV, COHORTNAME = "PCAWG-AU", CLASSIFIERNAME = "CX3/CX2 classifier")
summary(lPCAWGAUCX$cox)
# HR=0.6437 & p-value=0.0932


#### PCAWG-ESAD ####
mPCAWG.ESAD = as.matrix(dtPcawg[ dtPcawg$cancer_type == "ESAD", c("CX3", "CX2")])

## Scaling
dtESAD = applyClinClass(dtPcawg[ dtPcawg$cancer_type == "ESAD", ], lModel)

## Simpify staging
dtESAD$NewTS = NA
dtESAD$NewTS[ grepl("T1", dtESAD$TS) ] = "I"
dtESAD$NewTS[ grepl("T2", dtESAD$TS) ] = "II"
dtESAD$NewTS[ grepl("T3", dtESAD$TS) ] = "III"
dtESAD$NewTS[ grepl("T4", dtESAD$TS) ] = "IV"
dtESAD$NewTS = factor(dtESAD$NewTS, levels = c("I", "II", "III", "IV")) 
dtESAD$TS = dtESAD$NewTS
dtESAD$NewTS = NULL

## Survival analysis
CLASSIFIER = "Classifier"
lPCAWGESADCX = plotKMandCoxOS(dtESAD, COHORTNAME = "PCAWG-ESAD", CLASSIFIERNAME = "CX3/CX2 classifier")
summary(lPCAWGESADCX$cox)
# HR=0.3489 & p-value=0.0141


#### ICGC ####
## Prepare survival
metaIcgc = fread(METAICGC560)
metaIcgc$OS = ifelse(metaIcgc$donor_vital_status == "alive", 0, 1)
metaIcgc$OS.time = NA

## Combine column of OS.time
## First use interval last check up as base (data points for survived patients)
metaIcgc$OS.time = metaIcgc$`donor_interval_of_last_follow-up_in_DAYS`

## Then add data from deceased people
metaIcgc$OS.time[ metaIcgc$donor_vital_status == "deceased" ] = 
    metaIcgc$donor_survival_time_in_DAYS[ metaIcgc$donor_vital_status == "deceased" ]

## Convert
metaIcgc$OS.time[ metaIcgc$OS.time == "no_data_supplied" ] = NA
metaIcgc$OS.time = as.numeric(metaIcgc$OS.time)

## CX classifier
m560BC = readRDS(ICGC560)
rownames(m560BC) = substr(rownames(m560BC), 1, nchar(rownames(m560BC))-1)
metaIcgc$CX3 = m560BC[ match(metaIcgc$sample_name, rownames(m560BC)), "CX3" ]
metaIcgc$CX2 = m560BC[ match(metaIcgc$sample_name, rownames(m560BC)), "CX2" ]

## Scaling
metaIcgc = applyClinClass(metaIcgc, lModel)

## Convert age and put into categories
metaIcgc$Age = metaIcgc$donor_age_at_diagnosis
metaIcgc$Age[ metaIcgc$Age == "over_80" ] = 80
metaIcgc$Age = as.numeric(metaIcgc$Age)
metaIcgc$AgeCategory = cut(metaIcgc$Age, breaks = AGEBRACKETS, labels = 1:(length(AGEBRACKETS)-1))

## Has tumour stage info already but in a different column
metaIcgc$TS = metaIcgc$T_stage

## Survival analysis
CLASSIFIER = "Classifier"
l560BCCX = plotKMandCoxOS(metaIcgc, COHORTNAME = "ICGC 560 Breast cancers", CLASSIFIERNAME = "CX3/CX2 classifier")
summary(l560BCCX$cox)
# HR=2.6707 & p-value=0.00367


#### PLOTTING: COMPARE RESULTS WITH ORIGINAL PAPER #############################
## Load original results
dfSurv.org=readRDS(file.path(INDIR,"Classifier_SurvResults_Drews2022.rds"))
dfSurv.org$method="original"
dfSurv.org=dfSurv.org[dfSurv.org$test!="TCGA-OV BRCAmut",]

## Table with new results
dfSurv <- data.frame(test = factor(c('TCGA-OV',
                                     'PCAWG-OV',
                                     'PCAWG-ESAD',
                                     'ICGC-BRCA'),
                                   levels = c('TCGA-OV',
                                              'PCAWG-OV',
                                              'PCAWG-ESAD',
                                              'ICGC-BRCA')),
                     lower_hr = c(summary(lTCGAOVCX$cox)$conf.int[1, 3],
                                  summary(lPCAWGAUCX$cox)$conf.int[1, 3],
                                  summary(lPCAWGESADCX$cox)$conf.int[1, 3],
                                  summary(l560BCCX$cox)$conf.int[1, 3]),
                     upper_hr = c(summary(lTCGAOVCX$cox)$conf.int[1, 4],
                                  summary(lPCAWGAUCX$cox)$conf.int[1, 4],
                                  summary(lPCAWGESADCX$cox)$conf.int[1, 4],
                                  summary(l560BCCX$cox)$conf.int[1, 4]),
                     hr = c(summary(lTCGAOVCX$cox)$conf.int[1, 1],
                            summary(lPCAWGAUCX$cox)$conf.int[1, 1],
                            summary(lPCAWGESADCX$cox)$conf.int[1, 1],
                            summary(l560BCCX$cox)$conf.int[1, 1]))

dfSurv$method="new"

## Join both for plotting
dfSurv=rbind(dfSurv,dfSurv.org)
dfSurv$group=paste0(dfSurv$test,"+",dfSurv$method)
dfSurv$group=factor(dfSurv$group,levels = c("TCGA-OV+original","TCGA-OV+new",
                                            "PCAWG-OV+original","PCAWG-OV+new",
                                            "PCAWG-ESAD+original","PCAWG-ESAD+new",
                                            "ICGC-BRCA+original","ICGC-BRCA+new"))

p <- ggplot(dfSurv[dfSurv$test=="PCAWG-ESAD",], aes(x = group, color = method, fill = method)) +
    geom_errorbar(aes(ymin = lower_hr, ymax = upper_hr), width = 0.1, linewidth = 0.8) +
    geom_point(aes(y = hr), size = 3) +
    geom_hline(yintercept = 1, linetype = 3, linewidth = 0.5) +
    scale_y_log10(position = 'right',
                  breaks = (c(0.2, 0.5, 1, 2, 3)),
                  labels = c('0.2', '0.5', '1', '2', '3')) +
    scale_color_manual(values = c("original" = "sienna2", "new" = "skyblue2")) +
    coord_flip() +
    theme_light() +
    theme(#panel.grid = element_blank(),
          #axis.ticks = element_blank(),
          axis.title = element_blank(),
          #panel.border = element_blank(),
          axis.text.y = element_text(size = 6, vjust = 0.5),
          axis.text.x = element_text(size = 6, vjust = 1),
          legend.position = "none")
ggsave(file.path(FIGDIR, 'Platins_Classifier_ESAD_hr.svg'), p, device = 'svg', width = 50, height = 20, units = 'mm')