SarkomaDRG / Analyse_DRG_Github.ipynb
Analyse_DRG_Github.ipynb
Raw
library(devtools)
library(dplyr)
library(Matrix)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(patchwork)
library(ggrepel)
library(scales)
library(ggthemes)
library(purrr)
library(ggrastr)
library(nortest)
library(nnet)
library(MASS)
library(lmtest)
library(ggpubr)
library(readxl)
library(viridis)
library(svglite)
sessionInfo()
R version 4.0.5 (2021-03-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /home/INIM/vladyslav.kavaka/miniconda3/envs/azimuth/lib/libopenblasp-r0.3.17.so

locale:
 [1] LC_CTYPE=C.UTF-8    LC_NUMERIC=C        LC_TIME=C          
 [4] LC_COLLATE=C        LC_MONETARY=C       LC_MESSAGES=C      
 [7] LC_PAPER=C          LC_NAME=C           LC_ADDRESS=C       
[10] LC_TELEPHONE=C      LC_MEASUREMENT=C    LC_IDENTIFICATION=C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] svglite_2.1.3     viridis_0.6.3     viridisLite_0.4.2 readxl_1.3.1     
 [5] ggpubr_0.4.0      lmtest_0.9-40     zoo_1.8-12        MASS_7.3-55      
 [9] nnet_7.3-17       nortest_1.0-4     ggrastr_1.0.1     purrr_1.0.1      
[13] scales_1.2.1      ggrepel_0.9.3     patchwork_1.1.2   ggthemes_4.2.4   
[17] ggplot2_3.4.2     tidyr_1.1.4       Matrix_1.6-0      dplyr_1.0.7      
[21] devtools_2.4.5    usethis_2.2.2    

loaded via a namespace (and not attached):
 [1] fs_1.6.2          repr_1.1.4        tools_4.0.5       profvis_0.3.8    
 [5] backports_1.4.1   utf8_1.2.3        R6_2.5.1          vipor_0.4.5      
 [9] DBI_1.1.3         colorspace_2.0-3  urlchecker_1.0.1  withr_2.5.0      
[13] tidyselect_1.2.0  gridExtra_2.3     prettyunits_1.1.1 processx_3.8.2   
[17] compiler_4.0.5    cli_3.6.1         callr_3.7.3       pbdZMQ_0.3-7     
[21] systemfonts_1.0.3 stringr_1.5.0     digest_0.6.33     base64enc_0.1-3  
[25] pkgconfig_2.0.3   htmltools_0.5.5   sessioninfo_1.2.2 fastmap_1.1.1    
[29] htmlwidgets_1.6.2 rlang_1.1.1       shiny_1.7.4.1     generics_0.1.3   
[33] jsonlite_1.8.7    car_3.0-12        magrittr_2.0.3    Rcpp_1.0.11      
[37] ggbeeswarm_0.6.0  IRkernel_1.3      munsell_0.5.0     fansi_1.0.4      
[41] abind_1.4-5       lifecycle_1.0.3   stringi_1.7.6     carData_3.0-5    
[45] pkgbuild_1.4.2    grid_4.0.5        promises_1.2.0.1  crayon_1.5.2     
[49] miniUI_0.1.1.1    lattice_0.20-45   IRdisplay_1.1     ps_1.7.5         
[53] pillar_1.9.0      uuid_1.1-0        ggsignif_0.6.3    pkgload_1.3.2.1  
[57] glue_1.6.2        evaluate_0.21     remotes_2.4.2     vctrs_0.6.3      
[61] httpuv_1.6.11     cellranger_1.1.0  gtable_0.3.3      assertthat_0.2.1 
[65] cachem_1.0.8      mime_0.12         xtable_1.8-4      broom_0.7.12     
[69] rstatix_0.7.0     later_1.3.1       tibble_3.1.8      beeswarm_0.4.0   
[73] memoise_2.0.1     ellipsis_0.3.2   
set_figsize <- function(width, height){
    options(repr.plot.width = width, 
            repr.plot.height = height)
}

Load in the dataset

data <- read_excel('./Matrices/Fallliste_20240525.xlsx')

#create the grading High grade column
data$Grading_alt[data$Grading_alt %in% c('Low grade')] <- 'Low-grade'
data$High_grade <- data$Grading_alt
data$High_grade[data$Grading_alt == '2'] <- 1
data$High_grade[data$Grading_alt == '3'] <- 1
data$High_grade[data$Grading_alt %in% c('Low-grade', 'Low grade')] <- 0
data$High_grade[data$Grading_alt == '1'] <- 0
data$High_grade[data$Grading_alt == 'NA'] <- 0
data$High_grade[data$Grading_alt == 'High-grade'] <- 1
data$High_grade <- as.numeric(data$High_grade)

data$Grading_alt[data$Grading_alt == '1'] <- 'Low-grade'
data$Grading_alt[data$Grading_alt %in% c('2', '3')] <- 'High-grade'
unique(data$Grading_alt)
unique(data$High_grade)

#Rekonstruktion
data$Freie_Lappenplastik <- 0
data$Freie_Lappenplastik[data$Rekonstruktion == 3] <- 1

data$Lokale_Lappenplastik <- 0
data$Lokale_Lappenplastik[data$Rekonstruktion == 2] <- 1

data$SHTx <- 0
data$SHTx[data$Rekonstruktion == 1] <- 1

#add the column for the fall unter der mittleren VwD
data$Über_der_mittleren_VwD <- 1
for(i in 1:nrow(data)){
    if(data$Berechnungstage[i] < data[['Mittlere GrenzVwD']][i]){data$Über_der_mittleren_VwD[i] <- 0}
}

head(data)
colnames(data)
#read in the cost matrices
list_matrices <- list.files('./Matrices/')
list_matrices <- grep(x = list_matrices, pattern = 'Kosten', value = TRUE)

matrices <- c()
for(i in 1:length(list_matrices)){
    matrice_subset <- read_excel(paste0('./Matrices/', list_matrices[i]))
    matrices <- rbind(matrice_subset, matrices)
}

#make a wider pivot to combine the values with the list of patients for subsequent analysis
matrices <- pivot_wider(data = matrices, names_from = 'Kosten_KoStKurz', values_from = 3:ncol(matrices))
#sort only the values that are in both tables 
colnames(matrices)[colnames(matrices) == 'FallID'] <- 'Fallnummer'
data <- data[data$Fallnummer %in% matrices$Fallnummer, ]
#combine two tables
data_matrices <- merge(data, matrices, by = 'Fallnummer')
head(data_matrices)
colnames(data_matrices)
#create the Verlustfall column
data_matrices$Verlustfall <- 1
data_matrices$Verlustfall[data_matrices$Gesamt_Gesamt > 0] <- 0
table(data_matrices$Verlustfall)

#add the column to the table without cost matrices
data <- merge(data, data_matrices[c('Fallnummer', 'Verlustfall')], by = 'Fallnummer')
head(data)
dir_outs <- './outs/'
dir.create(dir_outs)
colnames(data)
nrow(data_matrices)

Take a look at the number of cases, distribution etc

variables_to_plot <- colnames(data)[21:ncol(data)]    
for(i in 1:length(variables_to_plot)){
# Piechart
        name <- variables_to_plot[i]
        width <- 15
        height <- 11
        set_figsize(width, height)
        obj <- data
        variable <- variables_to_plot[i]
        variable_name <- variable #for ggtitle
        obj$variable <- obj[[variable]]
        
        val_variable <- unique(obj[[variable]])
        obj_plot <- data.frame(matrix(NA, nrow = length(val_variable), ncol = 2))
        colnames(obj_plot) <- c('variable', 'value')
        obj_plot$variable <- val_variable
        
        for(i in 1:nrow(obj_plot)){
            obj_plot$value[i] <- 100*nrow(subset(obj, variable == val_variable[i])) / nrow(obj)
            obj_plot$value[i] <- round(obj_plot$value[i], 2)
        }
        if(nrow(obj_plot)==2){
            obj_plot$variable[obj_plot$variable == 1] <- 'ja'
            obj_plot$variable[obj_plot$variable == 0] <- 'nein'
        }
    
        ordering <- obj_plot$variable[order(obj_plot$value, decreasing = T)]
        obj_plot <- obj_plot[order(obj_plot$value, decreasing = T), ]
        
        obj_plot$variable <- factor(obj_plot$variable, levels = ordering)
    
        if(nrow(obj_plot)==2 & unique(obj_plot$variable %in% c('ja', 'nein'))){
            obj_plot$variable <- as.character(obj_plot$variable)
            obj_plot <- obj_plot[order(obj_plot$variable, decreasing = F), ]
            obj_plot$variable <- factor(obj_plot$variable, levels = c('ja', 'nein'))
        }
    
        plot <- ggplot(obj_plot, aes(x="", y=value, fill=variable)) +
          geom_bar(stat="identity", width=1, color="white") +
          coord_polar("y", start=0) +
          ggtitle(paste0(variable)) +
          theme_void() + # remove background, grid, numeric labels
          theme(
                plot.title = element_text(size = 40, hjust = 0.5, face = 'bold'),
                legend.title=element_text(size=40),
                legend.text=element_text(size=40)) +
        scale_fill_manual(name = '', values = hue_pal()(length(unique(obj_plot$variable))), labels = paste0(obj_plot$variable, ' (', obj_plot$value, '%)'))
        plot
        ggsave(plot, file = paste0(dir_outs, name, '.pdf'), width = width, height = height)
    ggsave(plot, file = paste0(dir_outs, name, '.svg'), width = width, height = height)
        write.csv(obj_plot, file = paste0(dir_outs, name, '.csv'))
}
# Piechart
name <- 'Diagnose'
width <- 15
height <- 11
set_figsize(width, height)
obj <- data
variable <- 'Diagnose'
variable_name <- variable #for ggtitle
obj$variable <- obj[[variable]]

val_variable <- unique(obj[[variable]])
obj_plot <- data.frame(matrix(NA, nrow = length(val_variable), ncol = 2))
colnames(obj_plot) <- c('variable', 'value')
obj_plot$variable <- val_variable

for(i in 1:nrow(obj_plot)){
    obj_plot$value[i] <- 100*nrow(subset(obj, variable == val_variable[i])) / nrow(obj)
    obj_plot$value[i] <- round(obj_plot$value[i], 2)
}

ordering <- obj_plot$variable[order(obj_plot$value, decreasing = T)]
obj_plot <- obj_plot[order(obj_plot$value, decreasing = T), ]
obj_plot$variable <- factor(obj_plot$variable, levels = ordering)

plot <- ggplot(obj_plot, aes(x="", y=value, fill=variable)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) +
  ggtitle(paste0(variable)) +
  theme_void() + # remove background, grid, numeric labels
  theme(
        plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
        legend.title=element_text(size=15),
        legend.text=element_text(size=15)) +
scale_fill_manual(name = '', values = hue_pal()(length(unique(obj_plot$variable))), labels = paste0(obj_plot$variable, ' (', obj_plot$value, '%)'))
plot
ggsave(plot, file = paste0(dir_outs, name, '.pdf'), width = width, height = height)
ggsave(plot, file = paste0(dir_outs, name, '.svg'), width = width, height = height)
write.csv(obj_plot, file = paste0(dir_outs, name, '.csv'))
#plot continues variables
variables_to_plot <- colnames(data)[21:ncol(data)]
variables_to_plot <- c('Alter', 'Behandlungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops')
obj_plot <- data
width <- 6
height <- 6
set_figsize(width, height)

for(i in 1:length(variables_to_plot)){
        variable <- variables_to_plot[i]
        obj_plot_subset <- obj_plot[variable]
        colnames(obj_plot_subset) <- 'Variable'
        plot <- ggplot(obj_plot_subset, aes(x = 0, y=Variable, fill = hue_pal()(1))) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          xlab("") +
            ggtitle(paste0(variable)) +
          ylab("") +
          scale_fill_manual(values=hue_pal()(1)) +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=15),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=15),
               legend.position = 'none')
        ggsave(plot, file = paste0(dir_outs, variable, '_boxplot.pdf'), width = width, height = height)
    ggsave(plot, file = paste0(dir_outs, variable, '_boxplot.svg'), width = width, height = height)
}

Take a look at the wins within the years:

grep(colnames(data_matrices), pattern = 'Gesamt', value = TRUE)
data_matrices[c('Patientennamen', 'Entlassungsdatum', 'Gesamt_Gesamt')]

sum(data_matrices[['Verlustfall']] == 1)
sum(data_matrices[['Verlustfall']] == 0)
#for year 2020:
print('2020')

data_year <- data_matrices[grep(data_matrices[['Entlassungsdatum']], pattern = 2020), ]
sum(data_year[['Gesamt_Gesamt']])
print('mean and sd')
mean(data_year[['Gesamt_Gesamt']])
sd(data_year[['Gesamt_Gesamt']])
print('median and iqr')
median(data_year[['Gesamt_Gesamt']])
IQR(data_year[['Gesamt_Gesamt']])
print('number of verlustfall')
sum(data_year[['Verlustfall']] == 1)
100*sum(data_year[['Verlustfall']] == 1) / nrow(data_year)

print('2021')

data_year <- data_matrices[grep(data_matrices[['Entlassungsdatum']], pattern = 2021), ]
sum(data_year[['Gesamt_Gesamt']])
print('mean and sd')
mean(data_year[['Gesamt_Gesamt']])
sd(data_year[['Gesamt_Gesamt']])
print('median and iqr')
median(data_year[['Gesamt_Gesamt']])
IQR(data_year[['Gesamt_Gesamt']])
print('number of verlustfall')
sum(data_year[['Verlustfall']] == 1)
100*sum(data_year[['Verlustfall']] == 1) / nrow(data_year)

print('2022')

data_year <- data_matrices[grep(data_matrices[['Entlassungsdatum']], pattern = 2022), ]
sum(data_year[['Gesamt_Gesamt']])
print('mean and sd')
mean(data_year[['Gesamt_Gesamt']])
sd(data_year[['Gesamt_Gesamt']])
print('median and iqr')
median(data_year[['Gesamt_Gesamt']])
IQR(data_year[['Gesamt_Gesamt']])
print('number of verlustfall')
sum(data_year[['Verlustfall']] == 1)
100*sum(data_year[['Verlustfall']] == 1) / nrow(data_year)
#test wheter parametric or not, if P > 0.05 then param
ad.test(data_year[['Gesamt_Gesamt']])
# Summ up the data:

Test proportions

outs_dir <- './outs_proportions/'
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}

data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables


data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
    #number of positives:
    number_of_positive <- sum(data[[df$Variable[i]]] == 1)
    df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}

Create table of quantitave values, test quantitive values

outs_dir <- './outs_quantitative/'
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables

for(i in 1:nrow(df)){
    df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
    df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables

obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')

for(i in 1:nrow(df)){
    variable <- variables[i]
    #test wheter parametric or not, if P > 0.05 then param
    p_value <- ad.test(obj[[variable]])[[2]]
    if(p_value < 0.05){
        df$description[i] <- 'median (IQR)'
        df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
    } else {
        df$description[i] <- 'mean (SD)'
        df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
    }
}

write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'

for(i in 1:length(variables)){
    variable_ja <- variables[i]
    data_subset <- obj[c(variables[i], 'Gruppe')]
    colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
    p_value <- ad.test(obj[[variable_ja]])[[2]]
    if(p_value < 0.05){
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    } else {
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    }
    obj_plot <- obj[c(variable_ja, 'Gruppe')]
    colnames(obj_plot)[1] <- 'Variable'
    plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
          xlab("Verlustfall?") +
          ylab("") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
    ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
     ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
    results <- rbind(results, statistics)
}


results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))

Analysis of the costs with correlation

dir_outs <- './outs_costs_correlation/'
dir.create(dir_outs)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
    correlation_result  <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
    subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)

# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
  geom_point() +
  geom_smooth() +
          ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0), 
                         ', rho = ', round(subset_results$rho[1], 3), ')')) +
          xlab(variables[i]) +
          ylab("Gesamterlös") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)

    #combine the test results
    results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))

Analysis of costs with frequency of Verlust per category

obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- './outs_costs_fractions/'
dir.create(outs_dir)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables

data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
    
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long

 plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
        guides(fill=guide_legend("Variable negativ?")) +
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}

Take a deeper look at the OP costs and ANA costs

OP

outs_dir <- './outs_costs_fractions/OP/'
dir.create(outs_dir)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '4 OP', value = T), 'Verlustfall')
variables
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall

data_small <- obj_subset

data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables

data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
    
    #plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long

 plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
        guides(fill=guide_legend("Variable negativ?")) +
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}

ANA

outs_dir <- './outs_costs_fractions/ANA/'
dir.create(outs_dir)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '5 ANA', value = T), 'Verlustfall')
variables
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall

data_small <- obj_subset

data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables

data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)


width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
    
    #plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long

 plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
        guides(fill=guide_legend("Variable negativ?")) +
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}

Verlustfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- './outs_costs_fractions/anteil_verlust/'
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
height
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
       
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)


#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

Gewinnfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- './outs_costs_fractions/anteil_gewinn/'
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
       
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)

#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

Run multiple regression

outs_dir <- 'outs'
dir.create(outs_dir)
colnames(data)
columns_remove <- c('Patientennummer', 'Geschlecht', 'Alter', 'BMI', 'Extremitaetenverlust')
obj <- data[!(colnames(data) %in% columns_remove)]

#remove NAs

#rename the gruppe variable for logistic regression
obj$Gruppe[obj$Gruppe == 'nein'] <- 0
obj$Gruppe[obj$Gruppe == 'ja'] <- 1
unique(obj$Gruppe)

obj <- na.omit(obj)
nrow(obj)
head(obj)
#Perform multinomial logistic regression with LR 

likehoodmultinom_p <- function(model_lmm) {i <- 1

  variables <-c("No funciona")
  Pvalues <- c("No funciona") 


  for (var in model_lmm$coefnames[-1]) { 

  variables[i] =paste(var)
  Pvalues[i]= lrtest(model_lmm, var)[[5]][2]
  i=i+1
  }
  return (data.frame(variables, Pvalues))
}

#compute the multinomial regression model
model <- multinom(formula = Gruppe ~ ., data = obj, trace = F)

#transform to or
model_or <- exp(cbind(OR = coef(model), confint(model)))
model_or <- data.frame(model_or)

#add coefficitents
model_or$Coefficients <- as.numeric(paste(coef(model)))
#model_or <- round(model_or, digits=2)

#perform LR statistics
test <- likehoodmultinom_p(model)
model_or <- model_or[2:nrow(model_or), ]

#combine results
model_or <- cbind(model_or, test$Pvalues)
colnames(model_or) <- c('OR', '2.5CI', '97.5CI', 'Coefficients', 'Pvalues')
for(i in 1:ncol(model_or)){
    model_or[[i]] <- round(as.numeric(model_or[[i]]), 2)
}

model_or$significance <- 'yes'
model_or$significance[model_or$Pvalues > 0.05] <- 'no'
model_or

write.csv(model_or, file = paste0(outs_dir, 'multinomial_regression_all.csv'))
set_figsize(10, 8)
color_qual_flow2 <- c("TRUE" = "#D3556E", yes = "#D3556E", "FALSE" = "lightgrey", no = "lightgrey")

data_plot <- model_or
data_plot$Variables <- rownames(data_plot)
data_plot <- data_plot[order(data_plot$Coefficients, decreasing = F),]
data_plot$Variables  <- factor(data_plot$Variables, levels = unique(data_plot$Variables))

data_plot$Values <- data_plot$Coefficients
plot <- ggplot(data_plot, aes(y=Variables, x= Values, fill = significance == 'yes'))+
        #xlim(-max(data$FC), max(data$FC))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "grey", linetype="dashed") +
        geom_segment( aes(yend=Variables, xend=0), col= "black") +
        geom_point(shape=21, aes(size = 4)) +
        #xlim(min(data_plot$Values), max(data_plot$Values)) +
        theme_light()+ xlab("Regressionskoeffiziente") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1), 
              legend.position = "right")+ ggtitle('Regressionskoeffiziente (Extremitätenerhalt)')
plot
ggsave(plot, file=paste0(outs_dir, 'koeff_all_multinomial.pdf'), width = 10, height = 8)

Check the data between the years

data_matrices_old <- data_matrices
data_old <- data

2020

year_analysis <- 2020
data_year <- data_old[grep(data_old[['Entlassungsdatum']], pattern = year_analysis), ]
data_matrices_year <- data_matrices_old[grep(data_matrices_old[['Entlassungsdatum']], pattern = year_analysis), ]
data <- data_year
nrow(data)

Test proportions

outs_dir <- paste0('./outs_proportions_', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}

data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables


data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
    #number of positives:
    number_of_positive <- sum(data[[df$Variable[i]]] == 1)
    df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}

Create table of quantitave values, test quantitive values

outs_dir <- paste0('./outs_quantitative', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables

for(i in 1:nrow(df)){
    df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
    df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables

obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')

for(i in 1:nrow(df)){
    variable <- variables[i]
    #test wheter parametric or not, if P > 0.05 then param
    p_value <- ad.test(obj[[variable]])[[2]]
    if(p_value < 0.05){
        df$description[i] <- 'median (IQR)'
        df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
    } else {
        df$description[i] <- 'mean (SD)'
        df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
    }
}

write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'

for(i in 1:length(variables)){
    variable_ja <- variables[i]
    data_subset <- obj[c(variables[i], 'Gruppe')]
    colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
    p_value <- ad.test(obj[[variable_ja]])[[2]]
    if(p_value < 0.05){
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    } else {
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    }
    obj_plot <- obj[c(variable_ja, 'Gruppe')]
    colnames(obj_plot)[1] <- 'Variable'
    plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
          xlab("Verlustfall?") +
          ylab("") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
    ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
     ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
    results <- rbind(results, statistics)
}


results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))

Analysis of the costs with correlation

dir_outs <- paste0('./outs_costs_correlation', year_analysis, '/')
dir.create(dir_outs)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
    correlation_result  <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
    subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)

# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
  geom_point() +
  geom_smooth() +
          ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0), 
                         ', rho = ', round(subset_results$rho[1], 3), ')')) +
          xlab(variables[i]) +
          ylab("Gesamterlös") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)

    #combine the test results
    results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))

Analysis of costs with frequency of Verlust per category

obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- paste0('./outs_costs_fractions', year_analysis, '/')
dir.create(outs_dir)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables

data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
    
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long

 plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
        guides(fill=guide_legend("Variable negativ?")) +
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}

Verlustfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- paste0('./outs_costs_fractions/anteil_verlust', year_analysis, '/')
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
   
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10,5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)


#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(5, 10)
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

Gewinnfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- paste0('./outs_costs_fractions/anteil_gewinn', year_analysis, '/')
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
       
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10, 5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)

#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

2021

year_analysis <- 2021
data_year <- data_old[grep(data_old[['Entlassungsdatum']], pattern = year_analysis), ]
data_matrices_year <- data_matrices_old[grep(data_matrices_old[['Entlassungsdatum']], pattern = year_analysis), ]
data <- data_year
nrow(data)

Test proportions

outs_dir <- paste0('./outs_proportions_', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}

data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables


data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
    #number of positives:
    number_of_positive <- sum(data[[df$Variable[i]]] == 1)
    df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}

Create table of quantitave values, test quantitive values

outs_dir <- paste0('./outs_quantitative', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables

for(i in 1:nrow(df)){
    df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
    df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables

obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')

for(i in 1:nrow(df)){
    variable <- variables[i]
    #test wheter parametric or not, if P > 0.05 then param
    p_value <- ad.test(obj[[variable]])[[2]]
    if(p_value < 0.05){
        df$description[i] <- 'median (IQR)'
        df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
    } else {
        df$description[i] <- 'mean (SD)'
        df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
    }
}

write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'

for(i in 1:length(variables)){
    variable_ja <- variables[i]
    data_subset <- obj[c(variables[i], 'Gruppe')]
    colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
    p_value <- ad.test(obj[[variable_ja]])[[2]]
    if(p_value < 0.05){
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    } else {
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    }
    obj_plot <- obj[c(variable_ja, 'Gruppe')]
    colnames(obj_plot)[1] <- 'Variable'
    plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
          xlab("Verlustfall?") +
          ylab("") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
    ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
     ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
    results <- rbind(results, statistics)
}


results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))

Analysis of the costs with correlation

dir_outs <- paste0('./outs_costs_correlation', year_analysis, '/')
dir.create(dir_outs)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
    correlation_result  <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
    subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)

# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
  geom_point() +
  geom_smooth() +
          ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0), 
                         ', rho = ', round(subset_results$rho[1], 3), ')')) +
          xlab(variables[i]) +
          ylab("Gesamterlös") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)

    #combine the test results
    results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))

Analysis of costs with frequency of Verlust per category

obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- paste0('./outs_costs_fractions', year_analysis, '/')
dir.create(outs_dir)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables

data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
    
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long

 plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
        guides(fill=guide_legend("Variable negativ?")) +
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}

Verlustfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- paste0('./outs_costs_fractions/anteil_verlust', year_analysis, '/')
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
   
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10,5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)


#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(5, 10)
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

Gewinnfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- paste0('./outs_costs_fractions/anteil_gewinn', year_analysis, '/')
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
       
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10, 5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)

#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

2022

year_analysis <- 2022
data_year <- data_old[grep(data_old[['Entlassungsdatum']], pattern = year_analysis), ]
data_matrices_year <- data_matrices_old[grep(data_matrices_old[['Entlassungsdatum']], pattern = year_analysis), ]
data <- data_year
nrow(data)

Test proportions

outs_dir <- paste0('./outs_proportions_', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}

data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables


data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
    #number of positives:
    number_of_positive <- sum(data[[df$Variable[i]]] == 1)
    df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}

Create table of quantitave values, test quantitive values

outs_dir <- paste0('./outs_quantitative', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables

for(i in 1:nrow(df)){
    df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
    df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables

obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')

for(i in 1:nrow(df)){
    variable <- variables[i]
    #test wheter parametric or not, if P > 0.05 then param
    p_value <- ad.test(obj[[variable]])[[2]]
    if(p_value < 0.05){
        df$description[i] <- 'median (IQR)'
        df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
    } else {
        df$description[i] <- 'mean (SD)'
        df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
    }
}

write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'

for(i in 1:length(variables)){
    variable_ja <- variables[i]
    data_subset <- obj[c(variables[i], 'Gruppe')]
    colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
    p_value <- ad.test(obj[[variable_ja]])[[2]]
    if(p_value < 0.05){
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    } else {
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    }
    obj_plot <- obj[c(variable_ja, 'Gruppe')]
    colnames(obj_plot)[1] <- 'Variable'
    plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
          xlab("Verlustfall?") +
          ylab("") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
    ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
     ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
    results <- rbind(results, statistics)
}


results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))

Analysis of the costs with correlation

dir_outs <- paste0('./outs_costs_correlation', year_analysis, '/')
dir.create(dir_outs)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
    correlation_result  <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
    subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)

# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
  geom_point() +
  geom_smooth() +
          ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0), 
                         ', rho = ', round(subset_results$rho[1], 3), ')')) +
          xlab(variables[i]) +
          ylab("Gesamterlös") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)

    #combine the test results
    results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))

Analysis of costs with frequency of Verlust per category

obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- paste0('./outs_costs_fractions', year_analysis, '/')
dir.create(outs_dir)

#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables

data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
    variable <- variables[i]
    
    number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
    nein_all <- sum(data_small[['Gruppe']] == 'nein')
    df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
    
    number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
    ja_all <- sum(data_small[['Gruppe']] == 'ja')
    df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}

write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small

for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    results <- rbind(results, results_subset)
}

results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)

results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 1, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height = 
                    .2, color = "gray50") +
        xlim(-2, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Odds Ratio") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)

color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable  <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))


plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
        scale_fill_manual(values = color_qual_flow2)+
        geom_vline(xintercept = 0, color = "black", linetype="dashed") +
        geom_point(shape=21, aes(size = 4)) +
        geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height = 
                    .2, color = "gray75") +
        xlim(-max_lim, max_lim) +
        #theme_tufte()+ 
        theme_light()+
        xlab("Log(OR)") + ylab("") +
        theme(text=element_text(family="Helvetica"),
            title = element_text(colour = 'black', size = 16),
            axis.text.y = element_text(colour = 'black', size = 16),
            axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'), 
              legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')

plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
    variable <- variables[i]
    #variable - columns, gruppe - rows
    
    
    data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
    data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
    data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
    data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
    
    
    #run the test
    
    test <- fisher.test(data_to_test)
    results_subset <- data.frame(matrix(NA, ncol =  6, nrow = 1))
    colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
    results_subset$Variable <- variable
    results_subset$p.value <- test$p.value
    if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
        results_subset$significancy <- 'no'
    }
    results_subset$CI_low <- test$conf.int[1]
    results_subset$CI_high <- test$conf.int[2]
    results_subset$OR <- test$estimate[[1]]
    
    #plot the fractions diagramm
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long
    
    plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('')+
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
    
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
    data_to_test$Verlustfall <- c('ja', 'nein')
    data_to_test_long <- pivot_longer(
      data_to_test, cols = 1:2, 
    )
    data_to_test_long$relative <- 0
    for(i in 1:nrow(data_to_test_long)){
        data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
    }
    data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
    data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
    data_to_test_long

 plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
        geom_bar(position="stack", stat="identity", ) + 
        ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) + 
        ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
        guides(fill=guide_legend("Variable negativ?")) +
        theme(
        plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
        text=element_text(family="Helvetica", size = 15),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.y = element_text(colour = 'black', size = 16), 
        axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
    ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}

Verlustfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- paste0('./outs_costs_fractions/anteil_verlust', year_analysis, '/')
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
   
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10,5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)


#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(5, 10)
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

Gewinnfälle: wer hatte den größten Anteil am Verlust?

outs_dir <- paste0('./outs_costs_fractions/anteil_gewinn', year_analysis, '/')
dir.create(outs_dir)

first the columns

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))

#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
       
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10, 5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)

then the rows

#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
    obj[[i]]<- as.numeric(obj[[i]])
}

colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)

#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal()+
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)

#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)

plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) + 
  geom_tile() + 
  cowplot::theme_cowplot() + 
  #grids(linetype = "dashed", size = 0.1) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab('') +
  xlab('') +
  theme(axis.ticks = element_blank()) + coord_equal() +
   #scale_fill_viridis()
  scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)

Test the continuous values between the years with ANOVA

outs_dir <- './comparison_years/'
dir.create(outs_dir)
data_matrices <- data_matrices_old
data <- data_old
data_matrices$Year <- NA
data_matrices$Year[grep(data_matrices[['Entlassungsdatum']], pattern = 2020)] <- 2020
data_matrices$Year[grep(data_matrices[['Entlassungsdatum']], pattern = 2021)] <- 2021
data_matrices$Year[grep(data_matrices[['Entlassungsdatum']], pattern = 2022)] <- 2022
unique(data_matrices$Year)
length(grep(data_matrices[['Entlassungsdatum']], pattern = 2022))
data_obj <- data_matrices
data_obj[is.na(data_obj)] <- 0
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data_matrices)){
    if(length(unique(data_matrices[[i]])) == 2){
        variables <- c(variables, colnames(data_matrices)[i])
    } else {variables_cont <- c(variables_cont, colnames(data_matrices)[i])}
}
variables_cont
variables_inek <- variables_cont[grep(variables_cont, pattern = 'Gesamt')]
variables_inek
for(i in 1:length(variables_inek)){
    data_obj[[variables_inek[i]]] <- as.numeric(data_obj[[variables_inek[i]]])
}
data_obj[is.na(data_obj)] <- 0
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', variables_inek)
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 7))
colnames(df) <- c('Variable', '2020, mean(SD)', '2020, median(IQR)', '2021, mean(SD)', '2021, median(IQR)', '2022, mean(SD)', '2022, median(IQR)')
df$Variable <- variables

for(i in 1:nrow(df)){
    df[['2020, mean(SD)']][i] <- paste0(round(mean(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), '(', round(sd(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), ')')
    df[['2020, median(IQR)']][i] <- paste0(round(median(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), '(', round(IQR(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), ')')
    df[['2021, mean(SD)']][i] <- paste0(round(mean(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), '(', round(sd(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), ')')
    df[['2021, median(IQR)']][i] <- paste0(round(median(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), '(', round(IQR(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), ')')
    df[['2022, mean(SD)']][i] <- paste0(round(mean(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), '(', round(sd(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), ')')
    df[['2022, median(IQR)']][i] <- paste0(round(median(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), '(', round(IQR(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df

Perform anova

#prepare the column names
data_to_test <- data_obj[c(variables, 'Year')]
data_to_test$Year <- as.factor(data_to_test$Year)
colnames(data_to_test) <- gsub(" ", "_", colnames(data_to_test))
colnames(data_to_test) <- gsub("^\\d+_", "", colnames(data_to_test))
colnames(data_to_test) <- gsub("^\\d+\\w_", "", colnames(data_to_test))
colnames(data_to_test)
#select variables
variables_to_test <- colnames(data_to_test)[1:(length(colnames(data_to_test))-2)]
variables_to_test
#test the values in loop
i <- 1
results_anova <- c()
for(i in 1:length(variables_to_test)){
        model <- aov(as.formula(paste0(variables_to_test[i], '~ Year')), data = data_to_test)
        TukeyHSD_results <- TukeyHSD(model)
        TukeyHSD_results <- as.data.frame(TukeyHSD_results[[1]])
        TukeyHSD_results$Comparisons <- rownames(TukeyHSD_results)
        TukeyHSD_results$Test <- 'TukeyHSD'
        TukeyHSD_results$Variable <- variables_to_test[i]
        TukeyHSD_results[['p adj']] <- round(TukeyHSD_results[['p adj']], 3)
        TukeyHSD_results$Significant <- 'no'
        TukeyHSD_results$Significant[TukeyHSD_results[['p adj']] < 0.05] <- 'yes'
        rownames(TukeyHSD_results) <- NULL
        results_anova <- rbind(results_anova, TukeyHSD_results)
}
results_anova <- results_anova[order(results_anova$Significant, decreasing = TRUE), ]
rownames(results_anova) <- NULL
results_anova
write.csv(results_anova, file = paste0(outs_dir, 'summary_anova_all.csv'))
#plot the once that are significant
variables_to_plot <- results_anova$Variable[results_anova$Significant == 'yes']
variables_to_plot <- unique(variables_to_plot)
variables_to_plot

for(i in 1:length(variables_to_plot)){
    variable_ja <- variables_to_plot[i]
    obj_plot <- data_to_test[c(variable_ja, 'Year')]
    
    plot <- ggplot(obj_plot, aes_string(x = 'Year', y = variable_ja, fill = 'Year')) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          ggtitle(paste0(variable_ja)) +
          xlab("Jahr") +
          ylab("") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
     print(plot)
    ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
}

take a look at the data per year and search for outliers

data_year_all <- data_obj[c('Patientennamen', 'Fallnummer', 'Year', 'Entlassungsdatum', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'Gesamt_Gesamt')]
write.csv(data_year_all, file = paste0(outs_dir, '_original_data_all.csv'))
write.csv(subset(data_year_all, Year == 2020), file = paste0(outs_dir, '_original_data_2020.csv'))
write.csv(subset(data_year_all, Year == 2021), file = paste0(outs_dir, '_original_data_2021.csv'))
write.csv(subset(data_year_all, Year == 2022), file = paste0(outs_dir, '_original_data_2022.csv'))
nrow(data_obj)
median(subset(data_obj, Verlustfall == 0)[['Mittlere GrenzVwD']])
#determine IQR
iqr_gesamt <- IQR(data_year_all$Gesamt_Gesamt)
median_gesamt <- median(data_year_all$Gesamt_Gesamt)
cutoff <- median_gesamt - 1.5*iqr_gesamt
data_year_outliers <- subset(data_year_all, Gesamt_Gesamt < cutoff)
data_year_outliers
write.csv(data_year_outliers, file = paste0(outs_dir, '_outliers.csv'))
# Schneider, Kurt: 32 Tage Aufenthalt, dabei einzeitig sogar Lappendeckung. Lnager Aufenthalt, lange Mobi bei Seitenband Reko.
# Lutz, Margot: Borggreve Plastik, OP 580 Minuten, viele EKs gegeben, dabei größter Verlust bei OP mit 11816 Euro verlust. Sehr interessanter Fall! Eigentlich unter mVD.
# Etter, Ingrid: Wahrscheinlich kodierungsbedingt bei gutartigem Befund (Fibromatose Desmoidtyp), am meisten im OP verloren bei OP zeit von 200 minuten
data_fall <-as.data.frame(t(data_obj[data_obj$Fallnummer == '202033759', variables]))
data_fall$Variable <- rownames(data_fall)
data_fall[order(data_fall[1], decreasing = F), ]

Create table of quantitave values, test quantitive values between freie Lappenplastik und nicht

outs_dir <- './outs_quantitative_lappenplastik/'
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
    if(length(unique(data[[i]])) == 2){
        variables <- c(variables, colnames(data)[i])
    } else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten', 'Gesamt_Gesamt')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables

for(i in 1:nrow(df)){
    df[['mean(SD)']][i] <- paste0(round(mean(data_matrices[[df$Variable[i]]]), 2), '(', round(sd(data_matrices[[df$Variable[i]]]), 2), ')')
    df[['median(IQR)']][i] <- paste0(round(median(data_matrices[[df$Variable[i]]]), 2), '(', round(IQR(data_matrices[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten', 'Gesamt_Gesamt')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables

obj <- data_matrices
obj$Gruppe <- data$Freie_Lappenplastik
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')

for(i in 1:nrow(df)){
    variable <- variables[i]
    #test wheter parametric or not, if P > 0.05 then param
    p_value <- ad.test(obj[[variable]])[[2]]
    if(p_value < 0.05){
        df$description[i] <- 'median (IQR)'
        df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
    } else {
        df$description[i] <- 'mean (SD)'
        df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
        df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
    }
}

write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops', 
               'OP_Erstop_Minuten', 'OP_Folgeops_Minuten', 'Gesamt_Gesamt')
results <- c()
obj <- data_matrices
obj$Gruppe <- data$Freie_Lappenplastik
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'

for(i in 1:length(variables)){
    variable_ja <- variables[i]
    data_subset <- obj[c(variables[i], 'Gruppe')]
    colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
    p_value <- ad.test(obj[[variable_ja]])[[2]]
    if(p_value < 0.05){
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    } else {
        statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
        statistics[1] <- variable_ja
    }
    obj_plot <- obj[c(variable_ja, 'Gruppe')]
    colnames(obj_plot)[1] <- 'Variable'
    plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
          geom_boxplot(width = 0.6, outlier.shape = NA) +
          geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
          ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
          xlab("Verlustfall?") +
          ylab("") +
          cowplot::theme_cowplot() + 
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
                axis.title.x  = element_text(size=17),
                 plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
                axis.text.y = element_text(size=17),
               legend.position = 'none')
    ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
     ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
    results <- rbind(results, statistics)
}


results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))