melanoma_aec/evaluate_latent_reps_clustering.py · ARMED-MixedEffectsDL

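'''
Compute Davies-Bouldin and Calinski-Harabasz clustering scores for the latent
representations grouped by date, then create a scatterplot of inter-cluster
(inter-date), intra-PDX latent distance vs. intra-cluster (intra-date),
inter-PDX latent distance.
'''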

import pandas as pd
import numpy as np
import tqdm
import multiprocessing as mp

from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
from scipy.spatial.distance import cdist
from armed.misc import expand_results_path, expand_data_path

import matplotlib.pyplot as plt
import seaborn as sns

# Input/output locations, resolved through the project's path helpers.
strLatentsPath = expand_results_path('melanoma_aec_tiedweights/base_model_v3/epoch010_latents.pkl')
strImageListPath = expand_data_path('melanoma/4pdx6dates/data_train.csv')
strScatterSavePath = expand_results_path('melanoma_aec_tiedweights/base_model_v3/epoch010_latents_distances.svg')

dfLatents = pd.read_pickle(strLatentsPath)
dfImages = pd.read_csv(strImageListPath, index_col=0)

# Z-score every latent column (zero mean, unit standard deviation).
dfLatents -= dfLatents.mean(axis=0)
dfLatents /= dfLatents.std(axis=0)

# Clustering scores of the latents when grouped by date.
# NOTE(review): labels are taken from dfImages and passed alongside dfLatents,
# so both tables are assumed to list the same rows in the same order - confirm.
db = davies_bouldin_score(dfLatents, dfImages['date'])
ch = calinski_harabasz_score(dfLatents, dfImages['date'])

# NOTE(review): the "better" directions printed here are the reverse of the
# usual reading of these scores (lower Davies-Bouldin / higher
# Calinski-Harabasz = better separated clusters). Presumably intentional,
# i.e. the desired outcome is latents that do NOT separate by date - confirm.
print(f'Clustering scores:'
      f'\n\tDavies-Bouldin (higher is better): {db}'
      f'\n\tCalinski-Harabasz (lower is better): {ch}'
)

# Swap only the file extension; str.replace('.pkl', ...) would also rewrite
# any '.pkl' that happens to appear earlier in the path.
strClusteringPath = strLatentsPath.rsplit('.', 1)[0] + '_clustering.txt'
with open(strClusteringPath, 'w') as f:
    f.write(f'Davies-Bouldin: {db}'
            f'\nCalinski-Harabasz: {ch}')

# Attach per-row metadata positionally (.values ignores the index of dfImages).
dfLatents['Date'] = dfImages['date'].astype(str).values
dfLatents['Cell'] = dfImages['cell'].values
dfLatents['PDX'] = dfImages['celltype'].values

# Compute pairwise Euclidean distances: intra-PDX inter-day and intra-day inter-PDX
print('Computing inter-PDX and inter-date Euclidean distances')
# Sorted unique PDX labels and dates; arrCellTypes also fixes the legend order
# of the scatterplot drawn at the end of the script.
arrCellTypes = dfLatents['PDX'].unique()
arrCellTypes.sort()
arrDates = dfLatents['Date'].unique()
arrDates.sort()

def mean_str(col):
    """Aggregate one column of a group to a single scalar.

    Intended as the function passed to ``groupby(...).agg`` so that numeric
    columns are averaged while label columns keep their shared value.

    Args:
        col: pandas Series holding the values of one column for one group.

    Returns:
        The mean of ``col`` if it has a numeric dtype. Otherwise the single
        distinct non-null value of ``col`` (as a scalar), or ``np.nan`` when
        the column does not hold exactly one distinct non-null value.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col.mean()
    if col.nunique() == 1:
        # Return the scalar itself rather than the array from Series.unique():
        # an array is not a valid aggregation result for non-object dtypes and
        # may contain an extra NaN entry, since nunique() ignores NaN but
        # unique() does not.
        return col.dropna().iloc[0]
    return np.nan

def distance(args):
    """Compute mean latent distances for one (PDX, date, other date) comparison.

    Two kinds of mean pairwise Euclidean distance are computed between
    per-cell mean latent vectors (latents averaged over all rows of a cell):

    1. Intra-PDX: cells of ``from_pdx`` on ``from_date`` vs. cells of the
       same PDX on ``to_date``.
    2. Inter-PDX: cells of ``from_pdx`` on ``from_date`` vs. cells of each
       other PDX present on ``from_date``.

    Reads the module-level ``dfLatents`` table and uses ``mean_str`` for the
    per-cell aggregation.

    Args:
        args: dict with keys ``'from_pdx'``, ``'from_date'`` and ``'to_date'``.

    Returns:
        DataFrame with one row per other PDX found on ``from_date`` and columns
        ``PDX``, ``Other PDX``, ``Date0``, ``Date1``, ``Intra-PDX``,
        ``Inter-PDX``. The intra-PDX value is repeated on every row. The
        DataFrame is empty when no other PDX shares ``from_date``.
    """
    from_pdx = args['from_pdx']
    from_date = args['from_date']
    to_date = args['to_date']
    # Latent dimensions are the columns labelled 0..55.
    lsLatentCols = list(range(56))

    dfLatentPDX = dfLatents.loc[dfLatents['PDX'] == from_pdx]
    # Compute mean latent vector (across frames) for each cell
    dfLatentPDX = dfLatentPDX.groupby('Cell').agg(mean_str)

    dfDate0 = dfLatentPDX.loc[dfLatentPDX['Date'] == from_date, lsLatentCols]
    dfDate1 = dfLatentPDX.loc[dfLatentPDX['Date'] == to_date, lsLatentCols]
    # cdist between two *different* sets of cells gives a rectangular matrix in
    # which every entry is a distinct cell pair, so average all of it. Keeping
    # only the strict upper triangle (as one would for a self-distance matrix)
    # silently drops valid pairs and leaves nothing for m x 1 / 1 x n matrices.
    fDistIntraPDX = cdist(dfDate0, dfDate1, metric='euclidean').mean()

    # Loop through all other PDX's acquired on the same date
    arrToPDX = dfLatents.loc[dfLatents['Date'] == from_date, 'PDX'].unique()
    arrToPDX = arrToPDX[arrToPDX != from_pdx]

    lsInterPDX = []
    for to_pdx in arrToPDX:
        # Cells of the other PDX imaged on the same date
        dfOtherPDX = dfLatents.loc[(dfLatents['PDX'] == to_pdx) & (dfLatents['Date'] == from_date)]
        # Compute mean latent vector across frames for each cell
        dfOtherPDX = dfOtherPDX.groupby('Cell').agg(mean_str)
        dfOtherPDX = dfOtherPDX[lsLatentCols]
        # Again a cross-distance matrix: use every pair.
        fDistInterPDX = cdist(dfDate0, dfOtherPDX, metric='euclidean').mean()

        lsInterPDX.append({'PDX': from_pdx,
                           'Other PDX': to_pdx,
                           'Date0': from_date,
                           'Date1': to_date,
                           'Intra-PDX': fDistIntraPDX,
                           'Inter-PDX': fDistInterPDX})

    return pd.DataFrame(lsInterPDX)
    
# Build the comparisons, run them in parallel, then plot and summarize.
# Everything that starts worker processes sits behind the __main__ guard:
# with the "spawn" and "forkserver" start methods each worker re-imports this
# module, and an unguarded Pool at import time would make the workers try to
# start pools of their own.
if __name__ == '__main__':
    # One comparison per (PDX, date, other date of the same PDX)
    lsArgs = []
    for from_pdx in arrCellTypes:
        arrPDXDates = dfLatents.loc[dfLatents['PDX'] == from_pdx, 'Date'].unique()
        for from_date in arrPDXDates:
            for to_date in arrPDXDates[arrPDXDates != from_date]:
                lsArgs.append({'from_pdx': from_pdx, 'from_date': from_date, 'to_date': to_date})

    # Run in parallel
    with mp.Pool(8) as pool:
        lsDistances = list(tqdm.tqdm(pool.imap(distance, lsArgs), total=len(lsArgs)))

    # Each worker result is indexed from 0; renumber so row labels are unique.
    dfDistances = pd.concat(lsDistances, ignore_index=True)

    # Min-max scale both distance columns jointly so they share one [0, 1] axis
    lsDistCols = ['Intra-PDX', 'Inter-PDX']
    dfDistancesNorm = dfDistances.copy()
    dfDistancesNorm[lsDistCols] -= dfDistancesNorm[lsDistCols].values.min()
    dfDistancesNorm[lsDistCols] /= dfDistancesNorm[lsDistCols].values.max()

    fig, ax = plt.subplots(figsize=(7, 7))
    sns.scatterplot(data=dfDistancesNorm, x='Intra-PDX', y='Inter-PDX', hue='PDX', style='PDX',
                    hue_order=arrCellTypes,
                    style_order=arrCellTypes,
                    ax=ax)
    ax.set_xlabel('Mean distance between cells from same PDX, different days')
    ax.set_ylabel('Mean distance to cells from other PDXs, same day')
    # Identity line: points below it have intra-PDX distance > inter-PDX distance
    minval = dfDistancesNorm[lsDistCols].values.min()
    ax.plot([minval, 1], [minval, 1], 'k--')
    fig.savefig(strScatterSavePath)
    # Swap only the extension; replace('svg', 'png') would also rewrite any
    # 'svg' occurring elsewhere in the path.
    fig.savefig(strScatterSavePath.rsplit('.', 1)[0] + '.png')

    print('{:.03f}% of comparisons below diagonal'.format(
        100 * (dfDistancesNorm['Intra-PDX'] > dfDistancesNorm['Inter-PDX']).mean()
        ))