# CodingChallenge / KaplanMeier_main.py
# KaplanMeier_main.py
# Raw
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
from datetime import date




def plotThecurve(kmf):
    """Plot the Kaplan-Meier survival curve, save it as a PNG, and display it.

    Args:
        kmf: A fitted estimator exposing ``plot_survival_function()``
            (``main`` passes a ``KaplanMeierFitter``).

    Side effects:
        Writes ``plots/Kaplan_Meier_Survival_Curve.png`` (creating the
        ``plots`` directory if needed) and opens the figure via ``plt.show()``.
    """
    import os  # local import: only needed to prepare the output directory

    output_path = 'plots/Kaplan_Meier_Survival_Curve.png'

    # Draw the curve on a fresh figure and label it
    plt.figure(figsize=(10, 6))
    kmf.plot_survival_function()
    plt.title("Kaplan-Meier Survival Curve")
    plt.xlabel("Time (days)")
    plt.ylabel("Survival Probability")
    plt.grid()

    # savefig does not create missing directories; without this the call
    # fails with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save before show(): the figure is saved while it is still current.
    plt.savefig(output_path)
    plt.show()

# Function to calculate the survival time
def calculate_survival_time(row):
    """Return the number of whole days between diagnosis and the end of observation.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with the keys
            ``"Event"``, ``"Date of Diagnosis"``, ``"Date of Death"`` and
            ``"Todays_date"``. The date values must support subtraction
            yielding an object with a ``.days`` attribute.

    Returns:
        Days from diagnosis to death when ``row["Event"]`` is truthy,
        otherwise days from diagnosis to ``row["Todays_date"]``.
    """
    # NOTE(review): rows without an event are measured up to "Todays_date",
    # not the last follow-up date -- confirm this is the intended censoring time.
    end_column = "Date of Death" if row["Event"] else "Todays_date"
    elapsed = row[end_column] - row["Date of Diagnosis"]
    return elapsed.days

# Main function
def main():
    """Load the patient dataset, derive survival times, fit and plot a Kaplan-Meier curve.

    Steps:
        1. Read ``KaplanMeierDataset/Book1CSV.csv``.
        2. Parse the date columns (``dd.mm.YYYY``); unparseable values become ``NaT``.
        3. Keep only rows whose diagnosis date is not after the last follow-up
           and not after today.
        4. Derive ``Event`` (1 if a death date is present, else 0) and
           ``Survival Time days`` via ``calculate_survival_time``.
        5. Fit a ``KaplanMeierFitter`` and pass it to ``plotThecurve``.

    Intermediate frames are printed to stdout for inspection.
    """
    df = pd.read_csv("KaplanMeierDataset/Book1CSV.csv")
    # Today's date is the end of observation for rows with no recorded death
    df["Todays_date"] = pd.to_datetime(date.today())
    print(df.head(5))  # Printing first 5 records

    # Data preprocessing: convert date columns to datetime
    date_columns = [
        "Date of Birth",
        "Date of Diagnosis",
        "Date of Death",
        "Date of Last Follow-Up",
        "Date of Admission",
        "Date of Discharge",
    ]
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], format="%d.%m.%Y", errors="coerce")

    # Drop rows whose diagnosis date lies after the last follow-up or in the
    # future; such rows would otherwise yield negative survival times.
    # Comparisons against NaT are False, so rows with a missing diagnosis or
    # follow-up date are dropped here as well.
    diagnosed_before_follow_up = df["Date of Diagnosis"] <= df["Date of Last Follow-Up"]
    diagnosed_before_today = df["Date of Diagnosis"] <= df["Todays_date"]
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the original (avoids SettingWithCopyWarning).
    df = df[diagnosed_before_follow_up & diagnosed_before_today].copy()
    print(df.shape)

    # Event indicator: 1 if a death date is recorded, 0 otherwise
    df["Event"] = df["Date of Death"].notnull().astype(int)
    print(df.head(5))

    # Apply the function to each row of the DataFrame to compute the survival time
    df["Survival Time days"] = df.apply(calculate_survival_time, axis=1)
    print(df.head(5))

    # Fit the Kaplan-Meier estimator
    kmf = KaplanMeierFitter()
    kmf.fit(durations=df["Survival Time days"], event_observed=df["Event"])

    # Plot the curve
    plotThecurve(kmf)


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()