library(devtools)
library(dplyr)
library(Matrix)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(patchwork)
library(ggrepel)
library(scales)
library(ggthemes)
library(purrr)
library(ggrastr)
library(nortest)
library(nnet)
library(MASS)
library(lmtest)
library(ggpubr)
library(readxl)
library(viridis)
library(svglite)
sessionInfo()
R version 4.0.5 (2021-03-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS
Matrix products: default
BLAS/LAPACK: /home/INIM/vladyslav.kavaka/miniconda3/envs/azimuth/lib/libopenblasp-r0.3.17.so
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C
[4] LC_COLLATE=C LC_MONETARY=C LC_MESSAGES=C
[7] LC_PAPER=C LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] svglite_2.1.3 viridis_0.6.3 viridisLite_0.4.2 readxl_1.3.1
[5] ggpubr_0.4.0 lmtest_0.9-40 zoo_1.8-12 MASS_7.3-55
[9] nnet_7.3-17 nortest_1.0-4 ggrastr_1.0.1 purrr_1.0.1
[13] scales_1.2.1 ggrepel_0.9.3 patchwork_1.1.2 ggthemes_4.2.4
[17] ggplot2_3.4.2 tidyr_1.1.4 Matrix_1.6-0 dplyr_1.0.7
[21] devtools_2.4.5 usethis_2.2.2
loaded via a namespace (and not attached):
[1] fs_1.6.2 repr_1.1.4 tools_4.0.5 profvis_0.3.8
[5] backports_1.4.1 utf8_1.2.3 R6_2.5.1 vipor_0.4.5
[9] DBI_1.1.3 colorspace_2.0-3 urlchecker_1.0.1 withr_2.5.0
[13] tidyselect_1.2.0 gridExtra_2.3 prettyunits_1.1.1 processx_3.8.2
[17] compiler_4.0.5 cli_3.6.1 callr_3.7.3 pbdZMQ_0.3-7
[21] systemfonts_1.0.3 stringr_1.5.0 digest_0.6.33 base64enc_0.1-3
[25] pkgconfig_2.0.3 htmltools_0.5.5 sessioninfo_1.2.2 fastmap_1.1.1
[29] htmlwidgets_1.6.2 rlang_1.1.1 shiny_1.7.4.1 generics_0.1.3
[33] jsonlite_1.8.7 car_3.0-12 magrittr_2.0.3 Rcpp_1.0.11
[37] ggbeeswarm_0.6.0 IRkernel_1.3 munsell_0.5.0 fansi_1.0.4
[41] abind_1.4-5 lifecycle_1.0.3 stringi_1.7.6 carData_3.0-5
[45] pkgbuild_1.4.2 grid_4.0.5 promises_1.2.0.1 crayon_1.5.2
[49] miniUI_0.1.1.1 lattice_0.20-45 IRdisplay_1.1 ps_1.7.5
[53] pillar_1.9.0 uuid_1.1-0 ggsignif_0.6.3 pkgload_1.3.2.1
[57] glue_1.6.2 evaluate_0.21 remotes_2.4.2 vctrs_0.6.3
[61] httpuv_1.6.11 cellranger_1.1.0 gtable_0.3.3 assertthat_0.2.1
[65] cachem_1.0.8 mime_0.12 xtable_1.8-4 broom_0.7.12
[69] rstatix_0.7.0 later_1.3.1 tibble_3.1.8 beeswarm_0.4.0
[73] memoise_2.0.1 ellipsis_0.3.2
set_figsize <- function(width, height){
options(repr.plot.width = width,
repr.plot.height = height)
}
Load in the dataset
data <- read_excel('./Matrices/Fallliste_20240525.xlsx')
#create the grading High grade column
data$Grading_alt[data$Grading_alt %in% c('Low grade')] <- 'Low-grade'
data$High_grade <- data$Grading_alt
data$High_grade[data$Grading_alt == '2'] <- 1
data$High_grade[data$Grading_alt == '3'] <- 1
data$High_grade[data$Grading_alt %in% c('Low-grade', 'Low grade')] <- 0
data$High_grade[data$Grading_alt == '1'] <- 0
data$High_grade[data$Grading_alt == 'NA'] <- 0
data$High_grade[data$Grading_alt == 'High-grade'] <- 1
data$High_grade <- as.numeric(data$High_grade)
data$Grading_alt[data$Grading_alt == '1'] <- 'Low-grade'
data$Grading_alt[data$Grading_alt %in% c('2', '3')] <- 'High-grade'
unique(data$Grading_alt)
unique(data$High_grade)
#Rekonstruktion
data$Freie_Lappenplastik <- 0
data$Freie_Lappenplastik[data$Rekonstruktion == 3] <- 1
data$Lokale_Lappenplastik <- 0
data$Lokale_Lappenplastik[data$Rekonstruktion == 2] <- 1
data$SHTx <- 0
data$SHTx[data$Rekonstruktion == 1] <- 1
#add the column for the fall unter der mittleren VwD
data$Über_der_mittleren_VwD <- 1
for(i in 1:nrow(data)){
if(data$Berechnungstage[i] < data[['Mittlere GrenzVwD']][i]){data$Über_der_mittleren_VwD[i] <- 0}
}
head(data)
colnames(data)
#read in the cost matrices
list_matrices <- list.files('./Matrices/')
list_matrices <- grep(x = list_matrices, pattern = 'Kosten', value = TRUE)
matrices <- c()
for(i in 1:length(list_matrices)){
matrice_subset <- read_excel(paste0('./Matrices/', list_matrices[i]))
matrices <- rbind(matrice_subset, matrices)
}
#make a wider pivot to combine the values with the list of patients for subsequent analysis
matrices <- pivot_wider(data = matrices, names_from = 'Kosten_KoStKurz', values_from = 3:ncol(matrices))
#sort only the values that are in both tables
colnames(matrices)[colnames(matrices) == 'FallID'] <- 'Fallnummer'
data <- data[data$Fallnummer %in% matrices$Fallnummer, ]
#combine two tables
data_matrices <- merge(data, matrices, by = 'Fallnummer')
head(data_matrices)
colnames(data_matrices)
#create the Verlustfall column
data_matrices$Verlustfall <- 1
data_matrices$Verlustfall[data_matrices$Gesamt_Gesamt > 0] <- 0
table(data_matrices$Verlustfall)
#add the column to the table without cost matrices
data <- merge(data, data_matrices[c('Fallnummer', 'Verlustfall')], by = 'Fallnummer')
head(data)
dir_outs <- './outs/'
dir.create(dir_outs)
colnames(data)
nrow(data_matrices)
Take a look at the number of cases, distribution etc
variables_to_plot <- colnames(data)[21:ncol(data)]
for(i in 1:length(variables_to_plot)){
# Piechart
name <- variables_to_plot[i]
width <- 15
height <- 11
set_figsize(width, height)
obj <- data
variable <- variables_to_plot[i]
variable_name <- variable #for ggtitle
obj$variable <- obj[[variable]]
val_variable <- unique(obj[[variable]])
obj_plot <- data.frame(matrix(NA, nrow = length(val_variable), ncol = 2))
colnames(obj_plot) <- c('variable', 'value')
obj_plot$variable <- val_variable
for(i in 1:nrow(obj_plot)){
obj_plot$value[i] <- 100*nrow(subset(obj, variable == val_variable[i])) / nrow(obj)
obj_plot$value[i] <- round(obj_plot$value[i], 2)
}
if(nrow(obj_plot)==2){
obj_plot$variable[obj_plot$variable == 1] <- 'ja'
obj_plot$variable[obj_plot$variable == 0] <- 'nein'
}
ordering <- obj_plot$variable[order(obj_plot$value, decreasing = T)]
obj_plot <- obj_plot[order(obj_plot$value, decreasing = T), ]
obj_plot$variable <- factor(obj_plot$variable, levels = ordering)
if(nrow(obj_plot)==2 & unique(obj_plot$variable %in% c('ja', 'nein'))){
obj_plot$variable <- as.character(obj_plot$variable)
obj_plot <- obj_plot[order(obj_plot$variable, decreasing = F), ]
obj_plot$variable <- factor(obj_plot$variable, levels = c('ja', 'nein'))
}
plot <- ggplot(obj_plot, aes(x="", y=value, fill=variable)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
ggtitle(paste0(variable)) +
theme_void() + # remove background, grid, numeric labels
theme(
plot.title = element_text(size = 40, hjust = 0.5, face = 'bold'),
legend.title=element_text(size=40),
legend.text=element_text(size=40)) +
scale_fill_manual(name = '', values = hue_pal()(length(unique(obj_plot$variable))), labels = paste0(obj_plot$variable, ' (', obj_plot$value, '%)'))
plot
ggsave(plot, file = paste0(dir_outs, name, '.pdf'), width = width, height = height)
ggsave(plot, file = paste0(dir_outs, name, '.svg'), width = width, height = height)
write.csv(obj_plot, file = paste0(dir_outs, name, '.csv'))
}
# Piechart
name <- 'Diagnose'
width <- 15
height <- 11
set_figsize(width, height)
obj <- data
variable <- 'Diagnose'
variable_name <- variable #for ggtitle
obj$variable <- obj[[variable]]
val_variable <- unique(obj[[variable]])
obj_plot <- data.frame(matrix(NA, nrow = length(val_variable), ncol = 2))
colnames(obj_plot) <- c('variable', 'value')
obj_plot$variable <- val_variable
for(i in 1:nrow(obj_plot)){
obj_plot$value[i] <- 100*nrow(subset(obj, variable == val_variable[i])) / nrow(obj)
obj_plot$value[i] <- round(obj_plot$value[i], 2)
}
ordering <- obj_plot$variable[order(obj_plot$value, decreasing = T)]
obj_plot <- obj_plot[order(obj_plot$value, decreasing = T), ]
obj_plot$variable <- factor(obj_plot$variable, levels = ordering)
plot <- ggplot(obj_plot, aes(x="", y=value, fill=variable)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
ggtitle(paste0(variable)) +
theme_void() + # remove background, grid, numeric labels
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
legend.title=element_text(size=15),
legend.text=element_text(size=15)) +
scale_fill_manual(name = '', values = hue_pal()(length(unique(obj_plot$variable))), labels = paste0(obj_plot$variable, ' (', obj_plot$value, '%)'))
plot
ggsave(plot, file = paste0(dir_outs, name, '.pdf'), width = width, height = height)
ggsave(plot, file = paste0(dir_outs, name, '.svg'), width = width, height = height)
write.csv(obj_plot, file = paste0(dir_outs, name, '.csv'))
#plot continues variables
variables_to_plot <- colnames(data)[21:ncol(data)]
variables_to_plot <- c('Alter', 'Behandlungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops')
obj_plot <- data
width <- 6
height <- 6
set_figsize(width, height)
for(i in 1:length(variables_to_plot)){
variable <- variables_to_plot[i]
obj_plot_subset <- obj_plot[variable]
colnames(obj_plot_subset) <- 'Variable'
plot <- ggplot(obj_plot_subset, aes(x = 0, y=Variable, fill = hue_pal()(1))) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
xlab("") +
ggtitle(paste0(variable)) +
ylab("") +
scale_fill_manual(values=hue_pal()(1)) +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=15),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=15),
legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variable, '_boxplot.pdf'), width = width, height = height)
ggsave(plot, file = paste0(dir_outs, variable, '_boxplot.svg'), width = width, height = height)
}
Take a look at the wins within the years:
grep(colnames(data_matrices), pattern = 'Gesamt', value = TRUE)
data_matrices[c('Patientennamen', 'Entlassungsdatum', 'Gesamt_Gesamt')]
sum(data_matrices[['Verlustfall']] == 1)
sum(data_matrices[['Verlustfall']] == 0)
#for year 2020:
print('2020')
data_year <- data_matrices[grep(data_matrices[['Entlassungsdatum']], pattern = 2020), ]
sum(data_year[['Gesamt_Gesamt']])
print('mean and sd')
mean(data_year[['Gesamt_Gesamt']])
sd(data_year[['Gesamt_Gesamt']])
print('median and iqr')
median(data_year[['Gesamt_Gesamt']])
IQR(data_year[['Gesamt_Gesamt']])
print('number of verlustfall')
sum(data_year[['Verlustfall']] == 1)
100*sum(data_year[['Verlustfall']] == 1) / nrow(data_year)
print('2021')
data_year <- data_matrices[grep(data_matrices[['Entlassungsdatum']], pattern = 2021), ]
sum(data_year[['Gesamt_Gesamt']])
print('mean and sd')
mean(data_year[['Gesamt_Gesamt']])
sd(data_year[['Gesamt_Gesamt']])
print('median and iqr')
median(data_year[['Gesamt_Gesamt']])
IQR(data_year[['Gesamt_Gesamt']])
print('number of verlustfall')
sum(data_year[['Verlustfall']] == 1)
100*sum(data_year[['Verlustfall']] == 1) / nrow(data_year)
print('2022')
data_year <- data_matrices[grep(data_matrices[['Entlassungsdatum']], pattern = 2022), ]
sum(data_year[['Gesamt_Gesamt']])
print('mean and sd')
mean(data_year[['Gesamt_Gesamt']])
sd(data_year[['Gesamt_Gesamt']])
print('median and iqr')
median(data_year[['Gesamt_Gesamt']])
IQR(data_year[['Gesamt_Gesamt']])
print('number of verlustfall')
sum(data_year[['Verlustfall']] == 1)
100*sum(data_year[['Verlustfall']] == 1) / nrow(data_year)
#test wheter parametric or not, if P > 0.05 then param
ad.test(data_year[['Gesamt_Gesamt']])
# Summ up the data:
Test proportions
outs_dir <- './outs_proportions/'
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
#number of positives:
number_of_positive <- sum(data[[df$Variable[i]]] == 1)
df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}
Create table of quantitave values, test quantitive values
outs_dir <- './outs_quantitative/'
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables
for(i in 1:nrow(df)){
df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')
for(i in 1:nrow(df)){
variable <- variables[i]
#test wheter parametric or not, if P > 0.05 then param
p_value <- ad.test(obj[[variable]])[[2]]
if(p_value < 0.05){
df$description[i] <- 'median (IQR)'
df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
} else {
df$description[i] <- 'mean (SD)'
df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
}
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
for(i in 1:length(variables)){
variable_ja <- variables[i]
data_subset <- obj[c(variables[i], 'Gruppe')]
colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
p_value <- ad.test(obj[[variable_ja]])[[2]]
if(p_value < 0.05){
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
} else {
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
}
obj_plot <- obj[c(variable_ja, 'Gruppe')]
colnames(obj_plot)[1] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
xlab("Verlustfall?") +
ylab("") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
results <- rbind(results, statistics)
}
results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))
Analysis of the costs with correlation
dir_outs <- './outs_costs_correlation/'
dir.create(dir_outs)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
correlation_result <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)
# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
geom_point() +
geom_smooth() +
ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0),
', rho = ', round(subset_results$rho[1], 3), ')')) +
xlab(variables[i]) +
ylab("Gesamterlös") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)
#combine the test results
results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))
Analysis of costs with frequency of Verlust per category
obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- './outs_costs_fractions/'
dir.create(outs_dir)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
guides(fill=guide_legend("Variable negativ?")) +
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}
Take a deeper look at the OP costs and ANA costs
OP
outs_dir <- './outs_costs_fractions/OP/'
dir.create(outs_dir)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '4 OP', value = T), 'Verlustfall')
variables
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
data_small <- obj_subset
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
guides(fill=guide_legend("Variable negativ?")) +
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}
ANA
outs_dir <- './outs_costs_fractions/ANA/'
dir.create(outs_dir)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '5 ANA', value = T), 'Verlustfall')
variables
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
data_small <- obj_subset
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
guides(fill=guide_legend("Variable negativ?")) +
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}
Verlustfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- './outs_costs_fractions/anteil_verlust/'
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
height
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
Gewinnfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- './outs_costs_fractions/anteil_gewinn/'
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
Run multiple regression
outs_dir <- 'outs'
dir.create(outs_dir)
colnames(data)
columns_remove <- c('Patientennummer', 'Geschlecht', 'Alter', 'BMI', 'Extremitaetenverlust')
obj <- data[!(colnames(data) %in% columns_remove)]
#remove NAs
#rename the gruppe variable for logistic regression
obj$Gruppe[obj$Gruppe == 'nein'] <- 0
obj$Gruppe[obj$Gruppe == 'ja'] <- 1
unique(obj$Gruppe)
obj <- na.omit(obj)
nrow(obj)
head(obj)
#Perform multinomial logistic regression with LR
likehoodmultinom_p <- function(model_lmm) {i <- 1
variables <-c("No funciona")
Pvalues <- c("No funciona")
for (var in model_lmm$coefnames[-1]) {
variables[i] =paste(var)
Pvalues[i]= lrtest(model_lmm, var)[[5]][2]
i=i+1
}
return (data.frame(variables, Pvalues))
}
#compute the multinomial regression model
model <- multinom(formula = Gruppe ~ ., data = obj, trace = F)
#transform to or
model_or <- exp(cbind(OR = coef(model), confint(model)))
model_or <- data.frame(model_or)
#add coefficitents
model_or$Coefficients <- as.numeric(paste(coef(model)))
#model_or <- round(model_or, digits=2)
#perform LR statistics
test <- likehoodmultinom_p(model)
model_or <- model_or[2:nrow(model_or), ]
#combine results
model_or <- cbind(model_or, test$Pvalues)
colnames(model_or) <- c('OR', '2.5CI', '97.5CI', 'Coefficients', 'Pvalues')
for(i in 1:ncol(model_or)){
model_or[[i]] <- round(as.numeric(model_or[[i]]), 2)
}
model_or$significance <- 'yes'
model_or$significance[model_or$Pvalues > 0.05] <- 'no'
model_or
write.csv(model_or, file = paste0(outs_dir, 'multinomial_regression_all.csv'))
set_figsize(10, 8)
color_qual_flow2 <- c("TRUE" = "#D3556E", yes = "#D3556E", "FALSE" = "lightgrey", no = "lightgrey")
data_plot <- model_or
data_plot$Variables <- rownames(data_plot)
data_plot <- data_plot[order(data_plot$Coefficients, decreasing = F),]
data_plot$Variables <- factor(data_plot$Variables, levels = unique(data_plot$Variables))
data_plot$Values <- data_plot$Coefficients
plot <- ggplot(data_plot, aes(y=Variables, x= Values, fill = significance == 'yes'))+
#xlim(-max(data$FC), max(data$FC))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "grey", linetype="dashed") +
geom_segment( aes(yend=Variables, xend=0), col= "black") +
geom_point(shape=21, aes(size = 4)) +
#xlim(min(data_plot$Values), max(data_plot$Values)) +
theme_light()+ xlab("Regressionskoeffiziente") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right")+ ggtitle('Regressionskoeffiziente (Extremitätenerhalt)')
plot
ggsave(plot, file=paste0(outs_dir, 'koeff_all_multinomial.pdf'), width = 10, height = 8)
Check the data between the years
data_matrices_old <- data_matrices
data_old <- data
2020
year_analysis <- 2020
data_year <- data_old[grep(data_old[['Entlassungsdatum']], pattern = year_analysis), ]
data_matrices_year <- data_matrices_old[grep(data_matrices_old[['Entlassungsdatum']], pattern = year_analysis), ]
data <- data_year
nrow(data)
Test proportions
outs_dir <- paste0('./outs_proportions_', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
#number of positives:
number_of_positive <- sum(data[[df$Variable[i]]] == 1)
df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}
Create table of quantitave values, test quantitive values
outs_dir <- paste0('./outs_quantitative', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables
for(i in 1:nrow(df)){
df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')
for(i in 1:nrow(df)){
variable <- variables[i]
#test wheter parametric or not, if P > 0.05 then param
p_value <- ad.test(obj[[variable]])[[2]]
if(p_value < 0.05){
df$description[i] <- 'median (IQR)'
df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
} else {
df$description[i] <- 'mean (SD)'
df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
}
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
for(i in 1:length(variables)){
variable_ja <- variables[i]
data_subset <- obj[c(variables[i], 'Gruppe')]
colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
p_value <- ad.test(obj[[variable_ja]])[[2]]
if(p_value < 0.05){
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
} else {
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
}
obj_plot <- obj[c(variable_ja, 'Gruppe')]
colnames(obj_plot)[1] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
xlab("Verlustfall?") +
ylab("") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
results <- rbind(results, statistics)
}
results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))
Analysis of the costs with correlation
dir_outs <- paste0('./outs_costs_correlation', year_analysis, '/')
dir.create(dir_outs)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
correlation_result <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)
# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
geom_point() +
geom_smooth() +
ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0),
', rho = ', round(subset_results$rho[1], 3), ')')) +
xlab(variables[i]) +
ylab("Gesamterlös") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)
#combine the test results
results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))
Analysis of costs with frequency of Verlust per category
obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- paste0('./outs_costs_fractions', year_analysis, '/')
dir.create(outs_dir)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
guides(fill=guide_legend("Variable negativ?")) +
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}
Verlustfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- paste0('./outs_costs_fractions/anteil_verlust', year_analysis, '/')
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10,5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(5, 10)
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
Gewinnfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- paste0('./outs_costs_fractions/anteil_gewinn', year_analysis, '/')
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10, 5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
2021
year_analysis <- 2021
data_year <- data_old[grep(data_old[['Entlassungsdatum']], pattern = year_analysis), ]
data_matrices_year <- data_matrices_old[grep(data_matrices_old[['Entlassungsdatum']], pattern = year_analysis), ]
data <- data_year
nrow(data)
Test proportions
outs_dir <- paste0('./outs_proportions_', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
#number of positives:
number_of_positive <- sum(data[[df$Variable[i]]] == 1)
df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}
Create table of quantitave values, test quantitive values
outs_dir <- paste0('./outs_quantitative', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables
for(i in 1:nrow(df)){
df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')
for(i in 1:nrow(df)){
variable <- variables[i]
#test wheter parametric or not, if P > 0.05 then param
p_value <- ad.test(obj[[variable]])[[2]]
if(p_value < 0.05){
df$description[i] <- 'median (IQR)'
df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
} else {
df$description[i] <- 'mean (SD)'
df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
}
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
for(i in 1:length(variables)){
variable_ja <- variables[i]
data_subset <- obj[c(variables[i], 'Gruppe')]
colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
p_value <- ad.test(obj[[variable_ja]])[[2]]
if(p_value < 0.05){
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
} else {
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
}
obj_plot <- obj[c(variable_ja, 'Gruppe')]
colnames(obj_plot)[1] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
xlab("Verlustfall?") +
ylab("") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
results <- rbind(results, statistics)
}
results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))
Analysis of the costs with correlation
dir_outs <- paste0('./outs_costs_correlation', year_analysis, '/')
dir.create(dir_outs)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
correlation_result <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)
# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
geom_point() +
geom_smooth() +
ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0),
', rho = ', round(subset_results$rho[1], 3), ')')) +
xlab(variables[i]) +
ylab("Gesamterlös") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)
#combine the test results
results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))
Analysis of costs with frequency of Verlust per category
obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- paste0('./outs_costs_fractions', year_analysis, '/')
dir.create(outs_dir)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
guides(fill=guide_legend("Variable negativ?")) +
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}
Verlustfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- paste0('./outs_costs_fractions/anteil_verlust', year_analysis, '/')
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10,5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(5, 10)
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
Gewinnfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- paste0('./outs_costs_fractions/anteil_gewinn', year_analysis, '/')
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10, 5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
2022
year_analysis <- 2022
data_year <- data_old[grep(data_old[['Entlassungsdatum']], pattern = year_analysis), ]
data_matrices_year <- data_matrices_old[grep(data_matrices_old[['Entlassungsdatum']], pattern = year_analysis), ]
data <- data_year
nrow(data)
Test proportions
outs_dir <- paste0('./outs_proportions_', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
data_small <- data[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
#create the overview of cases overall
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 2))
colnames(df) <- c('Variable', 'absolut(%)')
df$Variable <- variables
for(i in 1:nrow(df)){
#number of positives:
number_of_positive <- sum(data[[df$Variable[i]]] == 1)
df[['absolut(%)']][i] <- paste0(number_of_positive, '(', round(100*number_of_positive/nrow(data), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data_all.csv'))
df
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
}
Create table of quantitave values, test quantitive values
outs_dir <- paste0('./outs_quantitative', year_analysis, '/')
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables
for(i in 1:nrow(df)){
df[['mean(SD)']][i] <- paste0(round(mean(data[[df$Variable[i]]]), 2), '(', round(sd(data[[df$Variable[i]]]), 2), ')')
df[['median(IQR)']][i] <- paste0(round(median(data[[df$Variable[i]]]), 2), '(', round(IQR(data[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')
for(i in 1:nrow(df)){
variable <- variables[i]
#test wheter parametric or not, if P > 0.05 then param
p_value <- ad.test(obj[[variable]])[[2]]
if(p_value < 0.05){
df$description[i] <- 'median (IQR)'
df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
} else {
df$description[i] <- 'mean (SD)'
df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
}
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten')
results <- c()
obj <- data
obj$Gruppe <- data$Verlustfall
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
for(i in 1:length(variables)){
variable_ja <- variables[i]
data_subset <- obj[c(variables[i], 'Gruppe')]
colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
p_value <- ad.test(obj[[variable_ja]])[[2]]
if(p_value < 0.05){
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
} else {
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
}
obj_plot <- obj[c(variable_ja, 'Gruppe')]
colnames(obj_plot)[1] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
xlab("Verlustfall?") +
ylab("") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
results <- rbind(results, statistics)
}
results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))
Analysis of the costs with correlation
dir_outs <- paste0('./outs_costs_correlation', year_analysis, '/')
dir.create(dir_outs)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
obj <- data_matrices[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
head(obj)
variables
# test correlation of the parameters on the outcome of 'Gesamt_Gesamt'
variables <- variables[!(variables %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-'))]
results <- c()
for(i in 1:length(variables)){
correlation_result <- cor.test(obj[[variables[i]]], obj[['Gesamt_Gesamt']], method = "spearman")
subset_results <- data.frame(Variable = variables[i], rho = correlation_result$estimate, p_value = correlation_result$p.value)
# Create the dot plot
obj_plot <- obj[c(variables[i], 'Gesamt_Gesamt')]
colnames(obj_plot)[colnames(obj_plot) == variables[i]] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Variable, y = Gesamt_Gesamt)) +
geom_point() +
geom_smooth() +
ggtitle(paste0(variables[i], ', Spearman, \np = ', round(subset_results$p_value[1], 0),
', rho = ', round(subset_results$rho[1], 3), ')')) +
xlab(variables[i]) +
ylab("Gesamterlös") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(dir_outs, variables[i], '_correlationplot.svg'), width = 6, height = 6)
#combine the test results
results <- rbind(results, subset_results)
}
results
write.csv(results, file = paste0(dir_outs, 'spearman_correlation.csv'))
Analysis of costs with frequency of Verlust per category
obj_subset <- obj
obj_subset <- obj_subset[1:ncol(obj_subset)-1]
obj_subset[obj_subset > 0] <- 0
obj_subset[obj_subset < 0] <- 1
obj_subset$Verlustfall <- obj$Verlustfall
head(obj_subset)
outs_dir <- paste0('./outs_costs_fractions', year_analysis, '/')
dir.create(outs_dir)
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt', value = T), 'Verlustfall')
data_small <- obj_subset[c(variables)]
data_small$Gruppe <- data_small$Verlustfall
data_small$Verlustfall <- NULL
variables <- variables[!(variables %in% 'Verlustfall')]
variables
data_small$Gruppe[data_small$Gruppe == 0] <- 'nein'
data_small$Gruppe[data_small$Gruppe == 1] <- 'ja'
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('variables', 'ja', 'nein')
df$variables <- variables
for(i in 1:nrow(df)){
variable <- variables[i]
number_nein <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'nein')
nein_all <- sum(data_small[['Gruppe']] == 'nein')
df$nein[i] <- paste0(number_nein, ' (', round(100*number_nein/nein_all, 2), '%)')
number_ja <- sum(data_small[[variable]] == 1 & data_small[['Gruppe']] == 'ja')
ja_all <- sum(data_small[['Gruppe']] == 'ja')
df$ja[i] <- paste0(number_ja, ' (', round(100*number_ja/ja_all, 2), '%)')
}
write.csv(df, file = paste0(outs_dir, 'proportional_variable_data.csv'))
df
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
results <- rbind(results, results_subset)
}
results$logOR <- log(results$OR)
results$logCI_low <- log(results$CI_low)
results$logCI_high <- log(results$CI_high)
results
write.csv(results, file = paste0(outs_dir, 'proportions_testing.csv'))
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
#results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- 43
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= OR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 1, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = CI_high, xmin = CI_low), size = .5, height =
.2, color = "gray50") +
xlim(-2, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Odds Ratio") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Odds Ratio (nein vs ja)')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing.svg'), width = width, height = height)
width <- 14
height <- 8
set_figsize(width, height)
color_qual_flow2 <- c("TRUE" = "#D3556E", significant = "#D3556E", "FALSE" = "lightgrey", unsignificant = "lightgrey")
#plot the results
results_to_plot <- results[results$OR != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_high != Inf, ]
results_to_plot <- results_to_plot[results_to_plot$CI_low != Inf, ]
max_lim <- max(results_to_plot$logCI_high)
#min_lim <- min(results_to_plot$logCI_low)
#results_to_plot <- results_to_plot[results_to_plot$CI_high < max_lim, ]
results_to_plot <- results_to_plot[order(results_to_plot$OR, decreasing = T), ]
results_to_plot$Variable <- factor(results_to_plot$Variable, levels = unique(results_to_plot$Variable))
plot <- ggplot(results_to_plot, aes(y=Variable, x= logOR, fill = significancy == 'yes'))+
scale_fill_manual(values = color_qual_flow2)+
geom_vline(xintercept = 0, color = "black", linetype="dashed") +
geom_point(shape=21, aes(size = 4)) +
geom_errorbarh(aes(xmax = logCI_high, xmin = logCI_low), size = .5, height =
.2, color = "gray75") +
xlim(-max_lim, max_lim) +
#theme_tufte()+
theme_light()+
xlab("Log(OR)") + ylab("") +
theme(text=element_text(family="Helvetica"),
title = element_text(colour = 'black', size = 16),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 16, colour = 'black'),
legend.position = "right")+ ggtitle('Logarithmische Odds Ratio')
plot
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.pdf'), width = width, height = height)
ggsave(plot, file = paste0(outs_dir, 'OR_proportion_testing_logarithmic.svg'), width = width, height = height)
#create frequency plots with p values
results <- c()
obj <- data_small
for(i in 1:length(variables)){
variable <- variables[i]
#variable - columns, gruppe - rows
data_to_test <- data.frame(matrix(NA, nrow = 2, ncol = 2))
data_to_test[1, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'ja')
data_to_test[2, 1] <- sum(obj[[variable]] == 1 & obj[['Gruppe']] == 'nein')
data_to_test[1, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'ja')
data_to_test[2, 2] <- sum(obj[[variable]] == 0 & obj[['Gruppe']] == 'nein')
#run the test
test <- fisher.test(data_to_test)
results_subset <- data.frame(matrix(NA, ncol = 6, nrow = 1))
colnames(results_subset) <- c('Variable', 'p.value', 'significancy', 'CI_low', 'CI_high', 'OR')
results_subset$Variable <- variable
results_subset$p.value <- test$p.value
if(results_subset$p.value < 0.05){results_subset$significancy <- 'yes'} else {
results_subset$significancy <- 'no'
}
results_subset$CI_low <- test$conf.int[1]
results_subset$CI_high <- test$conf.int[2]
results_subset$OR <- test$estimate[[1]]
#plot the fractions diagramm
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, name == data_to_test_long$name[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=Verlustfall, y=relative, x=name)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('')+
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot.svg'), width = 6, height = 6)
#plot the fractions diagramm transposed (x axis - how many in Verlustfall, y - how many times the variable in Verlust)
data_to_test$Verlustfall <- c('ja', 'nein')
data_to_test_long <- pivot_longer(
data_to_test, cols = 1:2,
)
data_to_test_long$relative <- 0
for(i in 1:nrow(data_to_test_long)){
data_to_test_long$relative[i] <- 100*data_to_test_long$value[i]/sum(filter(data_to_test_long, Verlustfall == data_to_test_long$Verlustfall[i])$value)
}
data_to_test_long$name[data_to_test_long$name == 'X1'] <- paste0(variable, ' (ja)')
data_to_test_long$name[data_to_test_long$name == 'X2'] <- paste0(variable, ' (nein)')
data_to_test_long
plot <- ggplot(data_to_test_long, aes(fill=name, y=relative, x=Verlustfall)) +
geom_bar(position="stack", stat="identity", ) +
ggtitle(paste0('Exakter Test nach Fisher, p = ', round(results_subset$p.value, 2))) +
ylab('Fraktion der Fälle')+ xlab('Verlustfall?')+
guides(fill=guide_legend("Variable negativ?")) +
theme(
plot.title = element_text(size = 15, hjust = 0.5, face = 'bold'),
text=element_text(family="Helvetica", size = 15),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = 'black', size = 16),
axis.text.x = element_text(angle = 55, hjust = 1, size = 16, colour = 'black'))
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable, '_proportions_plot_t.svg'), width = 6, height = 6)
}
Verlustfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- paste0('./outs_costs_fractions/anteil_verlust', year_analysis, '/')
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10,5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 1, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(5, 10)
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
Gewinnfälle: wer hatte den größten Anteil am Verlust?
outs_dir <- paste0('./outs_costs_fractions/anteil_gewinn', year_analysis, '/')
dir.create(outs_dir)
first the columns
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = '_Gesamt', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall'))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'spalten_mittelwert.csv'))
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
set_figsize(10, 5)
plot
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'spalten_unscaled.svg'), width = 10, height = 5)
then the rows
#first the columns
obj <- data_matrices[data_matrices$Verlustfall == 0, ]
#select the variables
variables <- c(grep(x = colnames(data_matrices), pattern = 'Gesamt_', value = T), 'Verlustfall')
obj <- obj[variables]
obj[is.na(obj)] <- 0
obj[obj == '-'] <- 0
for(i in 1:ncol(obj)){
obj[[i]]<- as.numeric(obj[[i]])
}
colnames(obj)
head(obj)
#prepare the table for the heatmap
obj_heatmap <- obj[!(colnames(obj) %in% c('Gesamt_Gesamt', 'Verlustfall', 'Gesamt_-', 'Gesamt_6 KS'))]
#obj_heatmap <- obj_heatmap[order(colnames(obj_heatmap))]
obj_heatmap[1, ] <- colSums(obj_heatmap)/nrow(obj_heatmap)
obj_heatmap <- obj_heatmap[1, ]
obj_heatmap <- data.frame(t(obj_heatmap))
colnames(obj_heatmap) <- 'Value'
obj_heatmap$Variable <- rownames(obj_heatmap)
obj_heatmap$Value_scaled <- scale(obj_heatmap$Value)
obj_heatmap
write.csv(obj_heatmap, file = paste0(outs_dir, 'reihen_mittelwert.csv'))
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = obj_heatmap$Variable)
#plot the scaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled.svg'), width = 10, height = 5)
#plot the unscaled values
plot <- ggplot(obj_heatmap, aes(x = Variable, y = 1, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal()+
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
plot
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.pdf'), width = 10, height = 5)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled.svg'), width = 10, height = 5)
#plot the vertical plot for the future if needed
obj_heatmap$Variable <- factor(obj_heatmap$Variable, levels = rev(obj_heatmap$Variable))
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value_scaled)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_scaled_vertical.svg'), width = 5, height = 10)
plot <- ggplot(obj_heatmap, aes(x = 1, y = Variable, fill = Value)) +
geom_tile() +
cowplot::theme_cowplot() +
#grids(linetype = "dashed", size = 0.1) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab('') +
xlab('') +
theme(axis.ticks = element_blank()) + coord_equal() +
#scale_fill_viridis()
scale_fill_gradient2(low = '#1b4a6b', mid = "white", high = "#A20606", na.value = 'white')
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.pdf'), width = 5, height = 10)
ggsave(plot, file = paste0(outs_dir, 'reihen_unscaled_vertical.svg'), width = 5, height = 10)
Test the continuous values between the years with ANOVA
outs_dir <- './comparison_years/'
dir.create(outs_dir)
data_matrices <- data_matrices_old
data <- data_old
data_matrices$Year <- NA
data_matrices$Year[grep(data_matrices[['Entlassungsdatum']], pattern = 2020)] <- 2020
data_matrices$Year[grep(data_matrices[['Entlassungsdatum']], pattern = 2021)] <- 2021
data_matrices$Year[grep(data_matrices[['Entlassungsdatum']], pattern = 2022)] <- 2022
unique(data_matrices$Year)
length(grep(data_matrices[['Entlassungsdatum']], pattern = 2022))
data_obj <- data_matrices
data_obj[is.na(data_obj)] <- 0
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data_matrices)){
if(length(unique(data_matrices[[i]])) == 2){
variables <- c(variables, colnames(data_matrices)[i])
} else {variables_cont <- c(variables_cont, colnames(data_matrices)[i])}
}
variables_cont
variables_inek <- variables_cont[grep(variables_cont, pattern = 'Gesamt')]
variables_inek
for(i in 1:length(variables_inek)){
data_obj[[variables_inek[i]]] <- as.numeric(data_obj[[variables_inek[i]]])
}
data_obj[is.na(data_obj)] <- 0
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', variables_inek)
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 7))
colnames(df) <- c('Variable', '2020, mean(SD)', '2020, median(IQR)', '2021, mean(SD)', '2021, median(IQR)', '2022, mean(SD)', '2022, median(IQR)')
df$Variable <- variables
for(i in 1:nrow(df)){
df[['2020, mean(SD)']][i] <- paste0(round(mean(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), '(', round(sd(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), ')')
df[['2020, median(IQR)']][i] <- paste0(round(median(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), '(', round(IQR(data_obj[[df$Variable[i]]][data_obj$Year == 2020], na.rm = TRUE), 2), ')')
df[['2021, mean(SD)']][i] <- paste0(round(mean(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), '(', round(sd(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), ')')
df[['2021, median(IQR)']][i] <- paste0(round(median(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), '(', round(IQR(data_obj[[df$Variable[i]]][data_obj$Year == 2021], na.rm = TRUE), 2), ')')
df[['2022, mean(SD)']][i] <- paste0(round(mean(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), '(', round(sd(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), ')')
df[['2022, median(IQR)']][i] <- paste0(round(median(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), '(', round(IQR(data_obj[[df$Variable[i]]][data_obj$Year == 2022], na.rm = TRUE), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
#prepare the column names
data_to_test <- data_obj[c(variables, 'Year')]
data_to_test$Year <- as.factor(data_to_test$Year)
colnames(data_to_test) <- gsub(" ", "_", colnames(data_to_test))
colnames(data_to_test) <- gsub("^\\d+_", "", colnames(data_to_test))
colnames(data_to_test) <- gsub("^\\d+\\w_", "", colnames(data_to_test))
colnames(data_to_test)
#select variables
variables_to_test <- colnames(data_to_test)[1:(length(colnames(data_to_test))-2)]
variables_to_test
#test the values in loop
i <- 1
results_anova <- c()
for(i in 1:length(variables_to_test)){
model <- aov(as.formula(paste0(variables_to_test[i], '~ Year')), data = data_to_test)
TukeyHSD_results <- TukeyHSD(model)
TukeyHSD_results <- as.data.frame(TukeyHSD_results[[1]])
TukeyHSD_results$Comparisons <- rownames(TukeyHSD_results)
TukeyHSD_results$Test <- 'TukeyHSD'
TukeyHSD_results$Variable <- variables_to_test[i]
TukeyHSD_results[['p adj']] <- round(TukeyHSD_results[['p adj']], 3)
TukeyHSD_results$Significant <- 'no'
TukeyHSD_results$Significant[TukeyHSD_results[['p adj']] < 0.05] <- 'yes'
rownames(TukeyHSD_results) <- NULL
results_anova <- rbind(results_anova, TukeyHSD_results)
}
results_anova <- results_anova[order(results_anova$Significant, decreasing = TRUE), ]
rownames(results_anova) <- NULL
results_anova
write.csv(results_anova, file = paste0(outs_dir, 'summary_anova_all.csv'))
#plot the once that are significant
variables_to_plot <- results_anova$Variable[results_anova$Significant == 'yes']
variables_to_plot <- unique(variables_to_plot)
variables_to_plot
for(i in 1:length(variables_to_plot)){
variable_ja <- variables_to_plot[i]
obj_plot <- data_to_test[c(variable_ja, 'Year')]
plot <- ggplot(obj_plot, aes_string(x = 'Year', y = variable_ja, fill = 'Year')) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
ggtitle(paste0(variable_ja)) +
xlab("Jahr") +
ylab("") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
print(plot)
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
}
take a look at the data per year and search for outliers
data_year_all <- data_obj[c('Patientennamen', 'Fallnummer', 'Year', 'Entlassungsdatum', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'Gesamt_Gesamt')]
write.csv(data_year_all, file = paste0(outs_dir, '_original_data_all.csv'))
write.csv(subset(data_year_all, Year == 2020), file = paste0(outs_dir, '_original_data_2020.csv'))
write.csv(subset(data_year_all, Year == 2021), file = paste0(outs_dir, '_original_data_2021.csv'))
write.csv(subset(data_year_all, Year == 2022), file = paste0(outs_dir, '_original_data_2022.csv'))
nrow(data_obj)
median(subset(data_obj, Verlustfall == 0)[['Mittlere GrenzVwD']])
#determine IQR
iqr_gesamt <- IQR(data_year_all$Gesamt_Gesamt)
median_gesamt <- median(data_year_all$Gesamt_Gesamt)
cutoff <- median_gesamt - 1.5*iqr_gesamt
data_year_outliers <- subset(data_year_all, Gesamt_Gesamt < cutoff)
data_year_outliers
write.csv(data_year_outliers, file = paste0(outs_dir, '_outliers.csv'))
# Schneider, Kurt: 32 Tage Aufenthalt, dabei einzeitig sogar Lappendeckung. Lnager Aufenthalt, lange Mobi bei Seitenband Reko.
# Lutz, Margot: Borggreve Plastik, OP 580 Minuten, viele EKs gegeben, dabei größter Verlust bei OP mit 11816 Euro verlust. Sehr interessanter Fall! Eigentlich unter mVD.
# Etter, Ingrid: Wahrscheinlich kodierungsbedingt bei gutartigem Befund (Fibromatose Desmoidtyp), am meisten im OP verloren bei OP zeit von 200 minuten
data_fall <-as.data.frame(t(data_obj[data_obj$Fallnummer == '202033759', variables]))
data_fall$Variable <- rownames(data_fall)
data_fall[order(data_fall[1], decreasing = F), ]
Create table of quantitave values, test quantitive values between freie Lappenplastik und nicht
outs_dir <- './outs_quantitative_lappenplastik/'
dir.create(outs_dir)
colnames(data)
variables <- c()
variables_cont <- c()
for(i in 1:ncol(data)){
if(length(unique(data[[i]])) == 2){
variables <- c(variables, colnames(data)[i])
} else {variables_cont <- c(variables_cont, colnames(data)[i])}
}
variables_cont
variables <- c('Alter', 'Berechnungstage', 'Mittlere GrenzVwD', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten', 'Gesamt_Gesamt')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 3))
colnames(df) <- c('Variable', 'mean(SD)', 'median(IQR)')
df$Variable <- variables
for(i in 1:nrow(df)){
df[['mean(SD)']][i] <- paste0(round(mean(data_matrices[[df$Variable[i]]]), 2), '(', round(sd(data_matrices[[df$Variable[i]]]), 2), ')')
df[['median(IQR)']][i] <- paste0(round(median(data_matrices[[df$Variable[i]]]), 2), '(', round(IQR(data_matrices[[df$Variable[i]]]), 2), ')')
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave_all.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten', 'Gesamt_Gesamt')
df <- data.frame(matrix(NA, nrow = length(variables), ncol = 4))
colnames(df) <- c('variables', 'description', 'ja', 'nein')
df$variables <- variables
obj <- data_matrices
obj$Gruppe <- data$Freie_Lappenplastik
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
data_nein <- filter(obj, Gruppe == 'nein')
data_ja <- filter(obj, Gruppe == 'ja')
for(i in 1:nrow(df)){
variable <- variables[i]
#test wheter parametric or not, if P > 0.05 then param
p_value <- ad.test(obj[[variable]])[[2]]
if(p_value < 0.05){
df$description[i] <- 'median (IQR)'
df$nein[i] <- paste0(round(median(data_nein[[variable]], na.rm = T), 2), ' (', round(IQR(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(median(data_ja[[variable]], na.rm = T), 2), ' (', round(IQR(data_ja[[variable]], na.rm = T), 2), ')')
} else {
df$description[i] <- 'mean (SD)'
df$nein[i] <- paste0(round(mean(data_nein[[variable]], na.rm = T), 2), ' (', round(sd(data_nein[[variable]], na.rm = T), 2), ')')
df$ja[i] <- paste0(round(mean(data_ja[[variable]], na.rm = T), 2), ' (', round(sd(data_ja[[variable]], na.rm = T), 2), ')')
}
}
write.csv(df, file = paste0(outs_dir, 'summary_quantitave.csv'))
df
variables <- c('Alter', 'Berechnungstage', 'Anzahl_Nebendiagnosen', 'Anzahl_ops',
'OP_Erstop_Minuten', 'OP_Folgeops_Minuten', 'Gesamt_Gesamt')
results <- c()
obj <- data_matrices
obj$Gruppe <- data$Freie_Lappenplastik
obj$Gruppe[obj$Gruppe == 0] <- 'nein'
obj$Gruppe[obj$Gruppe == 1] <- 'ja'
for(i in 1:length(variables)){
variable_ja <- variables[i]
data_subset <- obj[c(variables[i], 'Gruppe')]
colnames(data_subset)[colnames(data_subset) != 'Gruppe'] <- 'Variable_to_test'
p_value <- ad.test(obj[[variable_ja]])[[2]]
if(p_value < 0.05){
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 'wilcox.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
} else {
statistics <- compare_means(Variable_to_test ~ Gruppe, data = data_subset, method = 't.test', p.adjust.method = 'bonferroni', paired = FALSE)
statistics[1] <- variable_ja
}
obj_plot <- obj[c(variable_ja, 'Gruppe')]
colnames(obj_plot)[1] <- 'Variable'
plot <- ggplot(obj_plot, aes(x = Gruppe, y = Variable, fill = Gruppe)) +
geom_boxplot(width = 0.6, outlier.shape = NA) +
geom_point(alpha = 0.2, position = position_dodge(width=0.6)) +
ggtitle(paste0(variable_ja, '\n(', statistics$method[1], ' Test, p = ', round(statistics$p.adj[1], 3), ')')) +
xlab("Verlustfall?") +
ylab("") +
cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=17),
axis.title.x = element_text(size=17),
plot.title = element_text(size = 20, hjust = 0.5, face = 'bold'),
axis.text.y = element_text(size=17),
legend.position = 'none')
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.pdf'), width = 6, height = 6)
ggsave(plot, file = paste0(outs_dir, variable_ja, '_boxplot.svg'), width = 6, height = 6)
results <- rbind(results, statistics)
}
results
write.csv(results, file = paste0(outs_dir, 'quantitave_testing.csv'))