ChemoResistantPredictionCIN / Analysis / Predicting_Chemotherapy_Response_Pancancer / Curating_HMF_Clinical_Data.R
Curating_HMF_Clinical_Data.R
Raw
# This script is for generating the curated HMF clinical history file 'HMF_clinical_history.RDS'

# Clean environment
freshr::freshr()

# Libraries
library(this.path)
library(readr)
library(dplyr)

# Files
pancancer_dir <- dirname(this.path())
base_dir <- dirname(dirname(pancancer_dir))
input_dir <- file.path(base_dir, 'Input_Data')
helper_dir <- file.path(base_dir, 'Analysis/Helper_Scripts')
source(file.path(helper_dir, 'Helper_functions.R'))
treatment_responses <- read_delim(file.path(input_dir, "treatment_responses.tsv"),
                                  delim = "\t",
                                  escape_double = FALSE,
                                  trim_ws = TRUE)
pre_biopsy_drugs <- read_delim(file.path(input_dir, "pre_biopsy_drugs.tsv"),
                               delim = "\t",
                               escape_double = FALSE,
                               trim_ws = TRUE)
post_biopsy_drugs <- read_delim(file.path(input_dir, "post_biopsy_drugs.tsv"),
                                delim = "\t",
                                escape_double = FALSE,
                                trim_ws = TRUE)
metadata <- read_delim(file.path(input_dir, "metadata.tsv"),
                       delim = "\t",
                       escape_double = FALSE,
                       trim_ws = TRUE)
metadata$patientIdentifier <- gsub('TI*V*$', '', metadata$sampleId, perl=TRUE)
hmf_segments <- readRDS(file.path(input_dir, 'hmf_cnp_smooth.rds'))
  

# Combine pre and post-biopsy drugs for patients that received post-biopsy treatment
combined_drugs <- data.frame(patientIdentifier=c(),
                             sampleId=c(),
                             startDate=c(),
                             endDate=c(),
                             name=c(),
                             type=c(),
                             mechanism=c(),
                             biopsy_state=c())
for(i in unique(post_biopsy_drugs$patientIdentifier)){
  temp_pre_df <- NULL
  sampleIds <- unique(post_biopsy_drugs$sampleId[post_biopsy_drugs$patientIdentifier==i])
  temp_pre_df <- pre_biopsy_drugs %>% filter(patientIdentifier==i)
  if(length(sampleIds)>0){
    temp_post_df <- NULL
    temp_pre_df_sample <- NULL
    temp_post_df <- post_biopsy_drugs %>% filter(sampleId %in% sampleIds)
    temp_df <- NULL
    if(nrow(temp_pre_df)>0){
      temp_pre_df_sample <- temp_pre_df %>%
        mutate(sampleId=sampleIds[1]) %>%
        select('patientIdentifier',
               'sampleId',
               'startDate',
               'endDate',
               'name',
               'type',
               'mechanism')
      temp_df <- rbind(temp_pre_df_sample, temp_post_df)
      combined_drugs <- rbind(combined_drugs, temp_df)
    }else{
      combined_drugs <- rbind(combined_drugs, temp_post_df)
    }
  }else{
    temp_pre_df <- temp_pre_df %>%
      mutate(sampleId=j) %>%
      select('patientIdentifier',
             'sampleId',
             'startDate',
             'endDate',
             'name',
             'type',
             'mechanism')
    combined_drugs <- rbind(combined_drugs, temp_pre_df)
  }
}

combined_drugs <- left_join(combined_drugs,
                            metadata %>%
                              mutate(dup=duplicated(patientIdentifier)) %>%
                              filter(!dup) %>%
                              select(patientIdentifier, cancer_type=primaryTumorLocation))

combined_drugs_filtered <- combined_drugs %>%
  filter(endDate!='null') %>%
  mutate(endDate=as.Date(endDate)) %>%
  filter(endDate>1950 & startDate>1950 & (endDate>=startDate)) %>%
  group_by(sampleId) %>%
  arrange(startDate)


# Combine overlapping treatment lines
for(i in 1:length(unique(combined_drugs_filtered$patientIdentifier))){
  print(i)
  if(i==1){
    temp_df <- assign_treatment_lines(combined_drugs_filtered %>%
                                        filter(patientIdentifier==unique(combined_drugs_filtered$patientIdentifier)[i]))
  }else{
    temp_df <- rbind(temp_df,
                     assign_treatment_lines(combined_drugs_filtered %>%
                                              filter(patientIdentifier==unique(combined_drugs_filtered$patientIdentifier)[i])))
  }
}

combined_drugs_collapsed <- temp_df %>%
  group_by(patientIdentifier, treatment_line, cancer_type) %>%
  summarise(name=paste(sort(unique(name)), collapse='/'),
            startDate=min(startDate),
            endDate=max(endDate)) %>%
  mutate(treatment_length=endDate-startDate)


# Calculate TTF
for(i in 1:length(unique(combined_drugs_collapsed$patientIdentifier))){
  id <- unique(combined_drugs_collapsed$patientIdentifier)[i]
  temp_drugs <- combined_drugs_collapsed %>% filter(patientIdentifier==id)
  temp_responses <- treatment_responses %>% filter(patientIdentifier==id)
  if(i==1){
    combined_drugs_TTF <- calc_TTF_HMF(temp_drugs, temp_responses)
  }else{
    combined_drugs_TTF <- rbind(combined_drugs_TTF, calc_TTF_HMF(temp_drugs, temp_responses))
  }
}


# Annotate with biopsy status
combined_drugs_TTF$biopsyState <- 'Unknown'
for(i in unique(combined_drugs_TTF$patientIdentifier)){
  bdate <- unique(metadata$biopsyDate[metadata$patientIdentifier==i])[1]
  if(bdate!='null'){
    bdate <- as.Date(bdate)
    combined_drugs_indexes <- which(combined_drugs_TTF$patientIdentifier==i)
    pre_biopsy_dates <- which(combined_drugs_TTF$startDate[combined_drugs_indexes] < bdate & combined_drugs_TTF$endDate[combined_drugs_indexes] <= bdate)
    post_biopsy_dates <- which(combined_drugs_TTF$startDate[combined_drugs_indexes] >= bdate & combined_drugs_TTF$endDate[combined_drugs_indexes] > bdate)
    biopsy_dates <- which(combined_drugs_TTF$startDate[combined_drugs_indexes] <= bdate & combined_drugs_TTF$endDate[combined_drugs_indexes] >= bdate)
    combined_drugs_TTF$biopsyState[combined_drugs_indexes[pre_biopsy_dates]] <- 'Pre'
    combined_drugs_TTF$biopsyState[combined_drugs_indexes[post_biopsy_dates]] <- 'Post'
    combined_drugs_TTF$biopsyState[combined_drugs_indexes[biopsy_dates]] <- 'Biopsy'
  }
}

saveRDS(combined_drugs_TTF, file.path(input_dir, 'HMF_clinical_data.RDS'))