# KaplanMeier_Bonus_main.py · CodingChallenge

import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
from datetime import date


def plotThecurve(kmf, subgroups, attribute):
    """Plot one Kaplan-Meier survival curve per subgroup and save the figure.

    The fitter is re-fitted for every subgroup and each curve is drawn onto
    the same figure. The figure is written to
    ``plots/Kaplan_Meier_Survival_Curve_by_<attribute>.png`` and then shown.

    Args:
        kmf: Fitter object exposing ``fit(durations, event_observed=...,
            label=...)`` and ``plot()``; ``main`` passes a lifelines
            ``KaplanMeierFitter``.
        subgroups: Iterable of ``(name, group)`` pairs (``main`` passes a
            pandas groupby). Each group must provide the columns
            "Survival Time days" and "Event".
        attribute: Name of the stratifying attribute; used in the plot title
            and in the output file name.
    """
    import os  # local import so the block does not depend on file-level edits

    plt.figure(figsize=(10, 6))
    for name, group in subgroups:
        # "Event" is a 0/1 flag (1 = death observed, 0 = censored). Testing it
        # for null-ness marks every row as an observed event because the flag
        # is never null, so convert the flag itself to a boolean instead.
        # Missing values are treated as censored.
        observed = group["Event"].fillna(0).astype(bool)
        kmf.fit(group["Survival Time days"], event_observed=observed, label=name)
        kmf.plot()
    plt.title(f"Kaplan-Meier Survival Curve by {attribute}")
    plt.xlabel("Time (Days)")
    plt.ylabel("Survival Probability")
    plt.grid()
    plt.legend()
    # Make sure the output directory exists; savefig does not create it.
    os.makedirs("plots", exist_ok=True)
    plt.savefig(f"plots/Kaplan_Meier_Survival_Curve_by_{attribute}.png")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

def justification():
    """Print the rationale for the attributes chosen for subgroup analysis."""
    rationale = '''
    The reason to select few of these features for subgroup in my opinion has a significant impacts on health condition for each person.

    Age: Age is a factor often correlates with disease incidence, progression, and overall health status.
         Analyzing survival outcomes across different age groups can provide insights into age-related patterns of disease occurrence and mortality.

    Gender: Gender differences may exist in disease prevalence, progression, response to treatment, and overall survival.
            Stratifying by gender allows for the examination of potential disparities and gender-specific risk factors affecting survival.

    Disease Stage: The stage of disease at diagnosis is a critical prognostic factor in many medical conditions.
                   Patients with advanced disease stages often have poorer outcomes compared to those diagnosed at earlier stages.
      Analyzing survival by disease stage helps assess the impact of disease severity on survival probabilities.

    Treatment Received: Treatment modalities such as surgery, chemotherapy, radiation therapy, or immunotherapy can significantly influence survival outcomes.
                        Comparing survival among patients who received different treatments provides insights into treatment efficacy and optimal therapeutic strategies.

    Smoking Status: Smoking is a well known risk factor for various diseases, including cancer, and respiratory disorders.
                    Analyzing survival outcomes based on smoking status helps to assess the role of smoking in disease progression and mortality.

    Family History: Family history of certain diseases can indicate genetic predisposition and may influence disease risk. Examining survival outcomes by family history helps
    elucidate the contribution of genetic and familial factors to disease outcomes.

    Vital Signs: Vital signs such as blood pressure, heart rate, respiratory rate, and body temperature provide important indicators of physiological function and health status.
    Abnormal vital signs may signal underlying medical conditions or disease progression, impacting survival probabilities.

    '''
    print(rationale)
def main():
    """Run the Kaplan-Meier subgroup analysis end to end.

    Reads ``KaplanMeierDataset/Book1CSV.csv``, parses the date columns,
    derives an event flag and a survival time in days for every patient,
    plots one survival figure per attribute via ``plotThecurve`` and finally
    prints the justification text.
    """
    df = pd.read_csv("KaplanMeierDataset/Book1CSV.csv")
    # Reference end date for patients with no recorded death; evaluated once
    # so every row (and the later fill) uses the same value.
    today = pd.to_datetime(date.today())
    df["Todays_date"] = today
    print(df.head(5))  # Printing first 5 records

    # Data preprocessing: convert date columns to datetime. Values that do
    # not match the expected format become NaT instead of raising.
    date_columns = [
        "Date of Birth",
        "Date of Diagnosis",
        "Date of Death",
        "Date of Last Follow-Up",
        "Date of Admission",
        "Date of Discharge",
    ]
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], format="%d.%m.%Y", errors="coerce")

    # Keep only rows whose dates cannot produce a negative survival time:
    # diagnosis must not lie after the last follow-up or after today, and a
    # recorded death must not precede the diagnosis.
    diagnosis = df["Date of Diagnosis"]
    death = df["Date of Death"]
    plausible = (
        (diagnosis <= df["Date of Last Follow-Up"])
        & (diagnosis <= df["Todays_date"])
        & (death.isna() | (death >= diagnosis))
    )
    # Copy so the new columns below are added to an independent frame rather
    # than to a slice of the original (avoids SettingWithCopyWarning).
    df = df[plausible].copy()

    # Event indicator: 1 if a death date is recorded, 0 otherwise.
    df["Event"] = df["Date of Death"].notnull().astype(int)
    print(df.head(5))

    # Survival time in days: diagnosis -> death when the event occurred,
    # diagnosis -> today otherwise. Vectorised, so it also works on an empty
    # frame.
    # NOTE(review): censored patients are measured up to today's date rather
    # than "Date of Last Follow-Up"; confirm this is the intended censoring
    # time.
    end_date = df["Date of Death"].fillna(today)
    df["Survival Time days"] = (end_date - df["Date of Diagnosis"]).dt.days

    # Identify relevant attributes for subgroup analysis
    # NOTE(review): grouping yields one curve per distinct value; if "Age" or
    # "Vital Signs" hold many distinct values, consider binning them first.
    attributes = [
        "Age",
        "Gender",
        "Disease Stage",
        "Treatment Received",
        "Smoking Status",
        "Family History",
        "Vital Signs",
    ]
    # Conduct subgroup analysis for each attribute
    for attribute in attributes:
        # Stratify the dataset based on the attribute
        subgroups = df.groupby(attribute)
        # Create a fresh Kaplan-Meier fitter for this attribute's figure
        kmf = KaplanMeierFitter()
        plotThecurve(kmf, subgroups, attribute)

    justification()


# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()