src/hrm_analysis.ipynb · hrm-analysis

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "hrm_analysis.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "r84Xnsq4Ixts"
      },
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from scipy.signal import savgol_filter\n",
        "\n",
        "import math\n",
        "\n",
        "from numpy.polynomial.polynomial import Polynomial   "
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "T_FUurzVJa4E"
      },
      "source": [
        "data = pd.read_csv(\"HRM.csv\", index_col=0, header=0, sep= \";\", decimal=\",\") #use the separator according to your file. Common separator are \",\", \";\"\n",
        "data.loc[:,'degree']=data.index  \n",
        "data.index = pd.to_datetime(data.index, unit='ms') \n",
        "\n",
        "folder_results = \"results\" # change this to your results folder to save figures\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6TPpDE_S600c"
      },
      "source": [
        "def plot_curve(index, N_sample_1, B_T, sample_1, name, peak):\n",
        "  plt.plot(index, N_sample_1, label='normalized_raw')\n",
        "  plt.plot(index, N_M, label='normalized_EBS')\n",
        "  plt.plot(index, -1*B_T, label='Background')\n",
        "  plt.plot(index, M_T, label='Raw MT')\n",
        "  plt.plot(index, sample_1, label='Raw sample')\n",
        "  plt.legend()\n",
        "  plt.suptitle(name)\n",
        "  plt.title(\"Melting temp\" + repr(index[peak[-1]])) #Peaks on the derivative curves can be used for the identification of the melting temperatures (Tm)\n",
        "  plt.savefig(results_folder+\"/sample_\"+name+\"_melt_curve\")\n",
        "  plt.show()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oMqjVJhC7R6d"
      },
      "source": [
        "def plot_derivative(index, derivative_values, key, peak):\n",
        "  ax = plt.plot(index, derivative_values, c='red')\n",
        "  plt.title(key + \": Negative First Derivative Curve\")\n",
        "  plt.axvline(start, linestyle='--', c='blue')\n",
        "  plt.axvline(end, linestyle='--', c='black')\n",
        "  for p in peak:\n",
        "    plt.axhline(derivative_values[p], linestyle='--', c='grey')\n",
        "    plt.text(index[p], derivative_values[p], \"temp: \"+\"{:.2f}\".format(index[p]))\n",
        "  plt.ylabel(\"-dF/dT\")\n",
        "  plt.xlabel(\"Temperature\")\n",
        "  plt.savefig(folder+\"/\"+key+\"_derivative\")\n",
        "  plt.show()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MQq88oDM7_dw"
      },
      "source": [
        "def plot_melting_region(index, sample_1, key, start, end):\n",
        "  plt.plot(index, sample_1, c='red')  \n",
        "  plt.title(key)\n",
        "  plt.axvline(start, linestyle='--', c='blue')\n",
        "  plt.axvline(end, linestyle='--', c='black')\n",
        "  plt.xlabel(\"Temperature\")\n",
        "  plt.ylabel(\"Fluorescence\")\n",
        "  plt.text(start, np.mean(sample_1)+0.2, \"T_m_start $\"+\"{:.2f}\".format(start)+\"$\")\n",
        "  plt.text(end, np.mean(sample_1)-0.1, \"T_m_end $\"+\"{:.2f}\".format(end)+\"$\")\n",
        "  plt.savefig(folder_results+\"/\"+key) \n",
        "  plt.show()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "axkqXMJW1BDQ"
      },
      "source": [
        "**Defining samples and negative control**\n",
        "\n",
        "Following the example, define which columns belong to each sample and which columns are the negative control"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "x6zwMhE4Tk_P"
      },
      "source": [
        "samples = dict()\n",
        "samples['Control_1'] = ['A2 ', 'A3 '] #Sample \"Control_1\" has two replicas in columns A2 and A3\n",
        "samples['Control_2'] = ['A4 ', 'A5 '] #Sample \"Control_2\" has two replicas in columns A4 and A5\n",
        "samples['A'] = ['A6 ', 'A7', 'A8'] #Sample \"A\" has three replicas in columns A6, A7 and A8\n",
        "samples['Negative'] = ['C2 ','C3 '] #Negative control is in columns C2 and C3\n",
        "\n",
        "control_samples= ['Control_1', \"Control_2\"] #Add your control samples name as you named them before. Separate them using ,\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "PEiVIu-f6CHc"
      },
      "source": [
        "**HRM Analysis**\n",
        "\n",
        "In this part, we process each sample using the algorithm described in the paper. The three steps are signaled by comments: finding melting region, apply EBS  and Melting peak identification "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tbEupO9PI-43"
      },
      "source": [
        "all_curves = pd.DataFrame()    \n",
        "all_derivatives = pd.DataFrame()   \n",
        "\n",
        "for key in samples:  \n",
        "    try:\n",
        "      columns = list(filter(lambda x: any(s in x for s in samples[key]), data.columns)) \n",
        "      sample = data[columns]  \n",
        "      sample.loc[:,'degree']=data['degree']\n",
        "      sample = sample.resample('1us').mean().interpolate()  \n",
        "      sample = sample.resample('60us').ffill() \n",
        "      sample = sample.dropna()\n",
        "      \n",
        "      #find RecWindow \n",
        "      index = sample['degree'].values\n",
        "      rec_window = len(index[index<index[0]+1]) #number of samples within 1 degree\n",
        "      diff = index - np.roll(index, 1)\n",
        "      delta = np.min(diff[1:])\n",
        "\n",
        "      sample_1 = np.median(sample, axis=1) #we use median of replicas instead of mean \n",
        "      derivative_values = -1*savgol_filter(sample_1, delta = delta, window_length=rec_window*2+1, polyorder=2,deriv=1) #polynomial order is set to 2, window is number of samples within 1 degree RecWindow*2+1\n",
        "      \n",
        "      #Finding melting region \n",
        "      tangent_derivative = savgol_filter(derivative_values,delta= delta/100, window_length=rec_window*2+1, polyorder=2,deriv=1) #polynomial order is set to 2, window is number of samples within 1 degree RecWindow*2+1\n",
        "      angles = np.degrees(np.arctan(tangent_derivative))\n",
        "      start = index[(angles>50)&(angles<55)]   \n",
        "      end = index[(angles<-30)&(angles<35)]         \n",
        "\n",
        "      if len(start)>0 and len(end)>0:  #at least one melting region was found\n",
        "          start = start[0]\n",
        "          end = end[end>start]\n",
        "          end = end[-1]\n",
        "\n",
        "          #plot melting region\n",
        "          plot_melting_region(index, sample_1, key, start, end)\n",
        "\n",
        "          \n",
        "          #apply EBS for melting curve \n",
        "          #the raw fluorescence F(T) is modeled as composing of the melting curve M(T) and an exponentially decaying background B(T). It can be expressed using the following mathematical model\n",
        "          #F(T) = M(T) + B(T)\n",
        "\n",
        "          #first determine TR and Tl where derivative of M(T) is 0, outside of start and end\n",
        "          TL = start - 2\n",
        "          TR = end + 2\n",
        "\n",
        "          tl_i = 0\n",
        "          tr_i = len(index)-1\n",
        "          if len(np.where(index<TL)[0])>0:\n",
        "            tl_i = np.where(index<TL)[-1][-1]\n",
        "          else:\n",
        "            TL = index[0]\n",
        "\n",
        "          if len(np.where(index>TR)[0])>0:\n",
        "            tr_i = np.where(index>TR)[0][0]\n",
        "          else:\n",
        "            TR = index[-1] \n",
        "\n",
        "          #We remove data that is not in the melting region\n",
        "          sample_1 = sample_1[tl_i:tr_i+1]   \n",
        "          n_index = index[tl_i:tr_i+1]\n",
        "\n",
        "          #Melting peak identification \n",
        "          peak = np.where(np.sign(tangent_derivative[:-1]) > np.sign(tangent_derivative[1:]))[0] #tm is where there is a change from positive to negative in tangent derivative \n",
        "\n",
        "          tangent_derivative = tangent_derivative[tl_i:tr_i+1]\n",
        "          plot_derivative(index, derivative_values, key, peak)\n",
        "\n",
        "\n",
        "          index = n_index\n",
        "\n",
        "          #Get Background \n",
        "          a = (math.log(derivative_values[tr_i])-math.log(derivative_values[tl_i]))/(TR-TL)\n",
        "          C =  derivative_values[tl_i] / a                   \n",
        "          exponents = a*(index-TL)                      \n",
        "          B_T =C*np.exp(exponents) \n",
        "          M_T = sample_1 - B_T        \n",
        "          \n",
        "          N_M = 100 * (M_T-np.min(M_T))/(np.max(M_T)-np.min(M_T))\n",
        "\n",
        "          #Normalize sample \n",
        "          N_sample_1 = 100 * (sample_1-np.min(sample_1))/(np.max(sample_1)-np.min(sample_1))        \n",
        "\n",
        "\n",
        "          #Plot samples and normalized \n",
        "          plot_curve(index, N_sample_1, B_T, sample_1, key, peak)\n",
        "\n",
        "\n",
        "          #save the curve for plotting all together \n",
        "          sample_data = pd.DataFrame(N_M, index=index, columns=[key])\n",
        "          all_curves = pd.concat([all_curves, sample_data], axis=1)\n",
        "\n",
        "          derivative = -1*savgol_filter(N_M, delta=delta, window_length=rec_window*2+1, polyorder=2,deriv=1) \n",
        "          sample_data = pd.DataFrame(derivative, index=index, columns=[key])\n",
        "          all_derivatives = pd.concat([all_derivatives, sample_data], axis=1)\n",
        "    except (Exception) as e:\n",
        "        print(\"error on sample \" + key)\n",
        "        print(e)\n",
        "        sample_data = pd.DataFrame(np.ones(len(index)), index=index, columns=[key])\n",
        "        all_derivatives = pd.concat([all_derivatives, sample_data], axis=1)\n",
        "        continue\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DgN4kwmX6qPS"
      },
      "source": [
        "**Plot results all together**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qpG5DiLHJEqb"
      },
      "source": [
        "#plot difference plot\n",
        "import seaborn as sns\n",
        "reshaped_df = all_curves.reset_index()\n",
        "reshaped_df = pd.melt(reshaped_df, id_vars='index')\n",
        "g= sns.lineplot(x='index', y='value', hue='variable', data=reshaped_df)\n",
        "plt.title(\"All samples melt curve\")\n",
        "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
        "g.get_figure().savefig(results_folder+\"/melting_all.png\", bbox_inches='tight', dpi=300)\n",
        "plt.show()\n",
        "\n",
        "reshaped_df = all_derivatives.reset_index()\n",
        "reshaped_df = pd.melt(reshaped_df, id_vars='index')\n",
        "g= sns.lineplot(x='index', y='value', hue='variable', data=reshaped_df)\n",
        "plt.title(\"All samples negative derviative curve\")\n",
        "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
        "g.get_figure().savefig(results_folder+\"/neg_derivative_all.png\",, bbox_inches='tight', dpi=300)\n",
        "plt.show()\n",
        "\n",
        "med_curve = np.mean(all_curves[control_samples], axis=1) \n",
        "plt.plot(med_curve)\n",
        "plt.show()\n",
        "\n",
        "\n",
        "all_curves_sub =all_curves.sub(med_curve, axis='index')\n",
        "reshaped_df = all_curves_sub.reset_index()\n",
        "reshaped_df = pd.melt(reshaped_df, id_vars='index')\n",
        "g= sns.lineplot(x='index', y='value', hue='variable', data=reshaped_df)\n",
        "plt.title(\"Difference plot\")\n",
        "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
        "g.get_figure().savefig(results_folder+\"/difference_melt.png\", bbox_inches='tight', dpi=300)\n",
        "plt.show()\n",
        "\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "X2A0XLc-1lrE"
      },
      "source": [
        "#Difference plot with all temperatures\n",
        "med_curve = med_curve.fillna(0)\n",
        "all_curves_sub =all_curves.sub(med_curve, axis='index')\n",
        "all_curves_sub = all_curves_sub.iloc[20:,]\n",
        "reshaped_df = all_curves_sub.reset_index()\n",
        "reshaped_df = pd.melt(reshaped_df, id_vars='index')\n",
        "g= sns.lineplot(x='index', y='value', hue='variable', data=reshaped_df)\n",
        "plt.title(\"Difference plot with all temperatures\")\n",
        "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
        "g.get_figure().savefig(results_folder+'/difference_melt_all_temps.png', bbox_inches='tight', dpi=300)\n",
        "plt.show()\n"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}