"""Kaplan-Meier survival analysis of a patient dataset, stratified by subgroup.

The script loads the patient CSV, derives an event indicator and a survival
time in days for every patient, draws one Kaplan-Meier plot per subgroup
attribute (saved as a PNG and shown on screen) and finally prints the
rationale for the chosen attributes.
"""

from datetime import date
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from lifelines import KaplanMeierFitter

# Default input file and output directory (both relative to the working dir).
DATA_FILE = "KaplanMeierDataset/Book1CSV.csv"
PLOT_DIR = "plots"

# Columns holding dates written as DD.MM.YYYY in the CSV.
DATE_COLUMNS = [
    "Date of Birth",
    "Date of Diagnosis",
    "Date of Death",
    "Date of Last Follow-Up",
    "Date of Admission",
    "Date of Discharge",
]

# Attributes used to stratify the patients; one plot is produced per entry.
# NOTE(review): "Age" and "Vital Signs" are grouped by their raw values, so
# each distinct value gets its own curve -- confirm they are categorical in
# the dataset, otherwise they should be binned first.
SUBGROUP_ATTRIBUTES = [
    "Age",
    "Gender",
    "Disease Stage",
    "Treatment Received",
    "Smoking Status",
    "Family History",
    "Vital Signs",
]

# Printed by justification(). The wording and whitespace are kept exactly as
# in the original literal; only the source layout is split across lines.
_JUSTIFICATION_TEXT = (
    " The reason to select few of these features for subgroup in my opinion "
    "has a significant impacts on health condition for each person. "
    "Age: Age is a factor often correlates with disease incidence, "
    "progression, and overall health status. "
    "Analyzing survival outcomes across different age groups can provide "
    "insights into age-related patterns of disease occurrence and mortality. "
    "Gender: Gender differences may exist in disease prevalence, progression, "
    "response to treatment, and overall survival. "
    "Stratifying by gender allows for the examination of potential "
    "disparities and gender-specific risk factors affecting survival. "
    "Disease Stage: The stage of disease at diagnosis is a critical "
    "prognostic factor in many medical conditions. "
    "Patients with advanced disease stages often have poorer outcomes "
    "compared to those diagnosed at earlier stages. "
    "Analyzing survival by disease stage helps assess the impact of disease "
    "severity on survival probabilities. "
    "Treatment Received: Treatment modalities such as surgery, chemotherapy, "
    "radiation therapy, or immunotherapy can significantly influence "
    "survival outcomes. "
    "Comparing survival among patients who received different treatments "
    "provides insights into treatment efficacy and optimal therapeutic "
    "strategies. "
    "Smoking Status: Smoking is a well known risk factor for various "
    "diseases, including cancer, and respiratory disorders. \n"
    "Analyzing survival outcomes based on smoking status helps to assess the "
    "role of smoking in disease progression and mortality. "
    "Family History: Family history of certain diseases can indicate genetic "
    "predisposition and may influence disease risk. "
    "Examining survival outcomes by family history helps elucidate the "
    "contribution of genetic and familial factors to disease outcomes. "
    "Vital Signs: Vital signs such as blood pressure, heart rate, "
    "respiratory rate, and body temperature provide important indicators of "
    "physiological function and health status. "
    "Abnormal vital signs may signal underlying medical conditions or "
    "disease progression, impacting survival probabilities. "
)


def plotThecurve(kmf, subgroups, attribute: str, output_dir: str = PLOT_DIR) -> None:
    """Plot, save and show one Kaplan-Meier curve per subgroup.

    Args:
        kmf: A ``KaplanMeierFitter``; it is re-fitted for every subgroup.
        subgroups: Iterable of ``(name, frame)`` pairs (e.g. a pandas
            ``groupby`` object). Each frame must provide the columns
            ``"Survival Time days"`` and ``"Event"``, where ``Event`` is
            truthy when the death was observed and 0/null when the patient
            is censored.
        attribute: Name of the stratifying attribute; used in the plot title
            and in the output file name.
        output_dir: Directory the PNG is written to; created if missing.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for name, group in subgroups:
        # Use the indicator itself. The previous ``~isnull()`` test was True
        # for every row of a 0/1 column, i.e. it marked censored patients as
        # deaths. Nulls are still accepted and count as "not observed".
        observed = group["Event"].fillna(0).astype(bool)
        kmf.fit(
            group["Survival Time days"],
            event_observed=observed,
            label=str(name),
        )
        # Draw on an explicit axes so all subgroups share one figure.
        kmf.plot_survival_function(ax=ax)

    ax.set_title(f"Kaplan-Meier Survival Curve by {attribute}")
    ax.set_xlabel("Time (Days)")
    ax.set_ylabel("Survival Probability")
    ax.grid(True)
    ax.legend()

    # savefig does not create missing directories on its own.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(target_dir / f"Kaplan_Meier_Survival_Curve_by_{attribute}.png")

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def justification() -> None:
    """Print the rationale for the attributes chosen for subgroup analysis."""
    print(_JUSTIFICATION_TEXT)


def main(csv_path: str = DATA_FILE) -> None:
    """Run the full analysis: load, preprocess, plot per subgroup, explain.

    Args:
        csv_path: Path of the patient CSV file to analyse.
    """
    df = pd.read_csv(csv_path)
    # Reference date used as the end of observation when no death is recorded.
    df["Todays_date"] = pd.to_datetime(date.today())
    print(df.head(5))  # Preview of the raw records

    # Parse the date columns; values not matching DD.MM.YYYY become NaT.
    for col in DATE_COLUMNS:
        df[col] = pd.to_datetime(df[col], format="%d.%m.%Y", errors="coerce")

    # Keep only rows whose diagnosis date is not after the last follow-up and
    # not in the future; otherwise the survival time would come out negative.
    # Comparisons against NaT are False, so rows missing either date are
    # dropped as well.
    plausible = (df["Date of Diagnosis"] <= df["Date of Last Follow-Up"]) & (
        df["Date of Diagnosis"] <= df["Todays_date"]
    )
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the original.
    df = df[plausible].copy()

    # Event indicator: 1 if a death date is recorded, 0 otherwise (censored).
    df["Event"] = df["Date of Death"].notna().astype(int)
    print(df.head(5))

    # Survival time runs from diagnosis to death, or to today's date when no
    # death is recorded.
    # NOTE(review): censoring at today's date assumes every patient without a
    # death record is still alive today; "Date of Last Follow-Up" is the usual
    # censoring time -- confirm which is intended.
    end_of_observation = df["Date of Death"].fillna(df["Todays_date"])
    df["Survival Time days"] = (
        end_of_observation - df["Date of Diagnosis"]
    ).dt.days

    # One stratified Kaplan-Meier plot per attribute, each with a fresh fitter.
    for attribute in SUBGROUP_ATTRIBUTES:
        plotThecurve(KaplanMeierFitter(), df.groupby(attribute), attribute)

    justification()


# Main program starts from here
if __name__ == "__main__":
    main()