L2EnglishGeneration / EtymDB / analysis_notebooks / Analysis_attemp2.ipynb
Analysis_attemp2.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 203,
   "id": "dc282d33-93c3-4c19-a49a-60feb16ee92a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# To read the .csv files\n",
    "import pandas as pd \n",
    "# To store the direct inheritance relations as a graph\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "from code_to_langs import wiki_code_to_lang\n",
    "import unicodedata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "id": "ad6271c3-1521-4fb4-b212-865d07e8e03b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2603880363.py:4: ParserWarning: Skipping line 745166: expected 5 fields, saw 6\n",
      "\n",
      "  df_values = pd.read_csv(path_values,\n",
      "/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2603880363.py:4: ParserWarning: Skipping line 998371: expected 5 fields, saw 6\n",
      "\n",
      "  df_values = pd.read_csv(path_values,\n"
     ]
    }
   ],
   "source": [
    "path_values = \"../data/split_etymdb/etymdb_values.csv\"\n",
    "path_link = \"../data/split_etymdb/etymdb_links_info.csv\"\n",
    "\n",
    "df_values = pd.read_csv(path_values,\n",
    "                        sep='\\t',\n",
    "                        names=[\"id\", \"lang\", \"field\", \"lexeme\", \"meaning\"],\n",
    "                        dtype={\"id\": int, \"lang\": str, \"field\": int, \"meaning\": str},\n",
    "                        on_bad_lines='warn').set_index(\"id\")\n",
    "\n",
    "df_link = pd.read_csv(path_link,\n",
    "                      sep='\\t',\n",
    "                      names=[\"relation_type\", \"child\", \"parent\"],\n",
    "                      dtype={\"relation_type\": str, \"child\": int, \"parent\": int})\n",
    "\n",
    "df_inher = df_link.loc[df_link['relation_type'].isin([\"inh\"])]\n",
    "df_bor = df_link.loc[df_link['relation_type'].isin([\"bor\"])]\n",
    "df_cog = df_link.loc[df_link['relation_type'].isin([\"cog\"])]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb8adc5f-d73b-4a36-a38a-23aa5dd08e35",
   "metadata": {},
   "source": [
    "### Data cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 302,
   "id": "17d9a12d-d58f-4df2-8be7-c1353f1a3383",
   "metadata": {},
   "outputs": [],
   "source": [
    "lang_map = {\n",
    "    'itc-ola': 'la', 'la': 'la', 'la-ecc': 'la', 'la-lat': 'la',\n",
    "    'la-med': 'la', 'la-new': 'la', 'la-ren': 'la', 'la-vul': 'la'\n",
    "}\n",
    "\n",
    "# Apply the mapping and fill other values with their original\n",
    "df_values['classification_lang'] = df_values['lang'].map(lang_map).fillna(df_values['lang'])\n",
    "\n",
    "def remove_accents(input_str):\n",
    "    if not isinstance(input_str, str):\n",
    "        return input_str\n",
    "    nfkd_form = unicodedata.normalize('NFKD', input_str)\n",
    "    # Keep only base characters, remove diacritics\n",
    "    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])\n",
    "\n",
    "\n",
    "# Apply the function only to lexemes where 'lang' is 'la'\n",
    "df_values['normalized_form'] = df_values.apply(\n",
    "    lambda row: remove_accents(row['lexeme']) if row['classification_lang'] == 'la' else row['lexeme'],\n",
    "    axis=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 303,
   "id": "162864e7-ef62-4ecb-9264-c5fafbcd7522",
   "metadata": {},
   "outputs": [],
   "source": [
    "map_series = df_values['normalized_form'] + '_' + df_values['classification_lang']\n",
    "\n",
    "# Replace IDs in df_link using the map\n",
    "df_link['child_lexeme'] = df_link['child'].map(map_series)\n",
    "df_link['parent_lexeme'] = df_link['parent'].map(map_series)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "id": "a72a3af4-51c9-4d5a-bf50-f4c21a144650",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>meaning</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1005511</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>curcuma</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>curcuma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1006130</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>aphis</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>aphis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1014623</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>Opiliones</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>Opiliones</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1023473</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>punctātrīx</td>\n",
       "      <td>marker by dots</td>\n",
       "      <td>la</td>\n",
       "      <td>punctatrix</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1026932</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>Gorgonia</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>Gorgonia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>975058</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>molybdaena</td>\n",
       "      <td>any of various substances resembling lead, inc...</td>\n",
       "      <td>la</td>\n",
       "      <td>molybdaena</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>979913</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>dactylo-</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>dactylo-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>983202</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>mōlecula</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>molecula</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98511</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>dys-</td>\n",
       "      <td>dysphemism</td>\n",
       "      <td>la</td>\n",
       "      <td>dys-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>993666</th>\n",
       "      <td>la-new</td>\n",
       "      <td>0</td>\n",
       "      <td>butaurus</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>butaurus</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>688 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           lang  field      lexeme  \\\n",
       "id                                   \n",
       "1005511  la-new      0     curcuma   \n",
       "1006130  la-new      0       aphis   \n",
       "1014623  la-new      0   Opiliones   \n",
       "1023473  la-new      0  punctātrīx   \n",
       "1026932  la-new      0    Gorgonia   \n",
       "...         ...    ...         ...   \n",
       "975058   la-new      0  molybdaena   \n",
       "979913   la-new      0    dactylo-   \n",
       "983202   la-new      0    mōlecula   \n",
       "98511    la-new      0        dys-   \n",
       "993666   la-new      0    butaurus   \n",
       "\n",
       "                                                   meaning  \\\n",
       "id                                                           \n",
       "1005511                                                NaN   \n",
       "1006130                                                NaN   \n",
       "1014623                                                NaN   \n",
       "1023473                                     marker by dots   \n",
       "1026932                                                NaN   \n",
       "...                                                    ...   \n",
       "975058   any of various substances resembling lead, inc...   \n",
       "979913                                                 NaN   \n",
       "983202                                                 NaN   \n",
       "98511                                           dysphemism   \n",
       "993666                                                 NaN   \n",
       "\n",
       "        classification_lang normalized_form  \n",
       "id                                           \n",
       "1005511                  la         curcuma  \n",
       "1006130                  la           aphis  \n",
       "1014623                  la       Opiliones  \n",
       "1023473                  la      punctatrix  \n",
       "1026932                  la        Gorgonia  \n",
       "...                     ...             ...  \n",
       "975058                   la      molybdaena  \n",
       "979913                   la        dactylo-  \n",
       "983202                   la        molecula  \n",
       "98511                    la            dys-  \n",
       "993666                   la        butaurus  \n",
       "\n",
       "[688 rows x 6 columns]"
      ]
     },
     "execution_count": 304,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_values[df_values.eq('la-new').any(axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "id": "bb4fde5d-80fe-4fb8-a0d9-754747ed5c19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>meaning</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>dictionary</td>\n",
       "      <td>dictionary</td>\n",
       "      <td>en</td>\n",
       "      <td>dictionary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>enm</td>\n",
       "      <td>0</td>\n",
       "      <td>free</td>\n",
       "      <td>free</td>\n",
       "      <td>enm</td>\n",
       "      <td>free</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>ru</td>\n",
       "      <td>0</td>\n",
       "      <td>kot</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ru</td>\n",
       "      <td>kot</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1000</th>\n",
       "      <td>fy</td>\n",
       "      <td>0</td>\n",
       "      <td>diele</td>\n",
       "      <td>to divide, to separate</td>\n",
       "      <td>fy</td>\n",
       "      <td>diele</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10000</th>\n",
       "      <td>la</td>\n",
       "      <td>0</td>\n",
       "      <td>proprius</td>\n",
       "      <td>ownership</td>\n",
       "      <td>la</td>\n",
       "      <td>proprius</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999998</th>\n",
       "      <td>gmq-osw</td>\n",
       "      <td>0</td>\n",
       "      <td>þrir</td>\n",
       "      <td>NaN</td>\n",
       "      <td>gmq-osw</td>\n",
       "      <td>þrir</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999999</th>\n",
       "      <td>gmq-oda</td>\n",
       "      <td>0</td>\n",
       "      <td>fæm</td>\n",
       "      <td>NaN</td>\n",
       "      <td>gmq-oda</td>\n",
       "      <td>fæm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1397622</th>\n",
       "      <td>aql-pro</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>aql-pro</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1885631</th>\n",
       "      <td>sa</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>sa</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>907358</th>\n",
       "      <td>trk-pro</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>trk-pro</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1885104 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            lang  field      lexeme                 meaning  \\\n",
       "id                                                            \n",
       "1             en      0  dictionary              dictionary   \n",
       "10           enm      0        free                    free   \n",
       "100           ru      0         kot                     NaN   \n",
       "1000          fy      0       diele  to divide, to separate   \n",
       "10000         la      0    proprius               ownership   \n",
       "...          ...    ...         ...                     ...   \n",
       "999998   gmq-osw      0        þrir                     NaN   \n",
       "999999   gmq-oda      0         fæm                     NaN   \n",
       "1397622  aql-pro      0         NaN                     NaN   \n",
       "1885631       sa      0                                 NaN   \n",
       "907358   trk-pro      0         NaN                     NaN   \n",
       "\n",
       "        classification_lang normalized_form  \n",
       "id                                           \n",
       "1                        en      dictionary  \n",
       "10                      enm            free  \n",
       "100                      ru             kot  \n",
       "1000                     fy           diele  \n",
       "10000                    la        proprius  \n",
       "...                     ...             ...  \n",
       "999998              gmq-osw            þrir  \n",
       "999999              gmq-oda             fæm  \n",
       "1397622             aql-pro             NaN  \n",
       "1885631                  sa                  \n",
       "907358              trk-pro             NaN  \n",
       "\n",
       "[1885104 rows x 6 columns]"
      ]
     },
     "execution_count": 305,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "id": "e59fa0ca-1cbb-43e1-9327-c043ff72d4a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation_type</th>\n",
       "      <th>child</th>\n",
       "      <th>parent</th>\n",
       "      <th>child_lexeme</th>\n",
       "      <th>parent_lexeme</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>der</td>\n",
       "      <td>1430903</td>\n",
       "      <td>93275</td>\n",
       "      <td>terrado_pt</td>\n",
       "      <td>terra_pt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>inh</td>\n",
       "      <td>124835</td>\n",
       "      <td>239436</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>sḱeyd-_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>inh</td>\n",
       "      <td>124835</td>\n",
       "      <td>239435</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>skeyd-_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>inh</td>\n",
       "      <td>124835</td>\n",
       "      <td>124836</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>sḱeyd- ~ skeyt-_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>inh</td>\n",
       "      <td>124835</td>\n",
       "      <td>1217413</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>skeyd_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724901</th>\n",
       "      <td>inh</td>\n",
       "      <td>251147</td>\n",
       "      <td>354703</td>\n",
       "      <td>kokea_fi</td>\n",
       "      <td>kokedak_fiu-fin-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724902</th>\n",
       "      <td>inh</td>\n",
       "      <td>665740</td>\n",
       "      <td>93416</td>\n",
       "      <td>aluna_pt</td>\n",
       "      <td>alumna_la</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724903</th>\n",
       "      <td>der(s)</td>\n",
       "      <td>1120131</td>\n",
       "      <td>-37200</td>\n",
       "      <td>litográfico_es</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724904</th>\n",
       "      <td>bor</td>\n",
       "      <td>1163595</td>\n",
       "      <td>68631</td>\n",
       "      <td>cúlóm_ga</td>\n",
       "      <td>coulomb_en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724905</th>\n",
       "      <td>inh</td>\n",
       "      <td>1084576</td>\n",
       "      <td>927180</td>\n",
       "      <td>soiscéal_ga</td>\n",
       "      <td>soiscél_sga</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>724906 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       relation_type    child   parent     child_lexeme  \\\n",
       "0                der  1430903    93275       terrado_pt   \n",
       "1                inh   124835   239436  skītaną_gem-pro   \n",
       "2                inh   124835   239435  skītaną_gem-pro   \n",
       "3                inh   124835   124836  skītaną_gem-pro   \n",
       "4                inh   124835  1217413  skītaną_gem-pro   \n",
       "...              ...      ...      ...              ...   \n",
       "724901           inh   251147   354703         kokea_fi   \n",
       "724902           inh   665740    93416         aluna_pt   \n",
       "724903        der(s)  1120131   -37200   litográfico_es   \n",
       "724904           bor  1163595    68631         cúlóm_ga   \n",
       "724905           inh  1084576   927180      soiscéal_ga   \n",
       "\n",
       "                  parent_lexeme  \n",
       "0                      terra_pt  \n",
       "1                sḱeyd-_ine-pro  \n",
       "2                skeyd-_ine-pro  \n",
       "3       sḱeyd- ~ skeyt-_ine-pro  \n",
       "4                 skeyd_ine-pro  \n",
       "...                         ...  \n",
       "724901      kokedak_fiu-fin-pro  \n",
       "724902                alumna_la  \n",
       "724903                      NaN  \n",
       "724904               coulomb_en  \n",
       "724905              soiscél_sga  \n",
       "\n",
       "[724906 rows x 5 columns]"
      ]
     },
     "execution_count": 306,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "id": "b09984fc-27f0-4afe-879c-631690c2ba84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a directed graph\n",
    "G = nx.DiGraph()\n",
    "\n",
    "# Add edges to the graph\n",
    "for idx, row in df_link[df_link['relation_type'] != 'cog'].iterrows():\n",
    "    if pd.notna(row['parent']):  # Only add edges where the parent exists\n",
    "        G.add_edge(row['parent'], row['child'], relation_type=row['relation_type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "id": "861b5e80-4303-4202-9dd3-a65585fd132c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_etymological_paths(G, word):\n",
    "    \"\"\"\n",
    "    Retrieves all paths from the given word to its ancestors, including the relationship types along the paths.\n",
    "    \"\"\"\n",
    "    all_paths = {}\n",
    "    for ancestor in nx.ancestors(G, word):\n",
    "        # Find all simple paths from each ancestor to the word\n",
    "        paths = nx.all_simple_paths(G, source=ancestor, target=word)\n",
    "        detailed_paths = []\n",
    "        for path in paths:\n",
    "            # Get relationship types along the path\n",
    "            path_details = []\n",
    "            for i in range(len(path) - 1):\n",
    "                parent, child = path[i], path[i + 1]\n",
    "                relation_type = G.edges[parent, child]['relation_type']\n",
    "                path_details.append((parent, child, relation_type))\n",
    "            detailed_paths.append(path_details)\n",
    "        all_paths[ancestor] = detailed_paths \n",
    "    print_paths(all_paths)\n",
    "    # return all_paths\n",
    "\n",
    "\n",
    "\n",
    "def print_paths(paths):\n",
    "    \"\"\"\n",
    "    Prints each path with relationship details.\n",
    "    \"\"\"\n",
    "    for ancestor, paths_details in paths.items():\n",
    "        print(f\"Ancestor: {ancestor}\")\n",
    "        for path in paths_details:\n",
    "            # Create a readable path string, ensuring no child is double-printed\n",
    "            path_str = ''\n",
    "            for i, (parent, child, rel_type) in enumerate(path):\n",
    "                if i == 0:\n",
    "                    path_str += f\"{parent} --({rel_type})--> {child}\"\n",
    "                else:\n",
    "                    path_str += f\" --({rel_type})--> {child}\"\n",
    "            print(path_str)\n",
    "        print(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "id": "7679f966-7c3a-4ac0-a648-aa927df4b9c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ancestor: μύριος_grc\n",
      "μύριος_grc --(der)--> μυριάς_grc --(inh)--> myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
      "\n",
      "\n",
      "Ancestor: μυρίος_grc\n",
      "μυρίος_grc --(der)--> μυριάς_grc --(inh)--> myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
      "\n",
      "\n",
      "Ancestor: myriade_fr\n",
      "myriade_fr --(inh)--> myriad_en\n",
      "\n",
      "\n",
      "Ancestor: μυριάς_grc\n",
      "μυριάς_grc --(inh)--> myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
      "\n",
      "\n",
      "Ancestor: myrias_la\n",
      "myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "find_etymological_paths(G, 'myriad_en')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "id": "9285a171-d324-4478-9d00-172b1f9d5bae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ancestor: πλήθω_grc\n",
      "πλήθω_grc --(der)--> πληθώρη_grc --(inh)--> plethora_la --(bor)--> plethora_en\n",
      "\n",
      "\n",
      "Ancestor: pleh₁-_ine-pro\n",
      "pleh₁-_ine-pro --(inh)--> πλήθω_grc --(der)--> πληθώρη_grc --(inh)--> plethora_la --(bor)--> plethora_en\n",
      "\n",
      "\n",
      "Ancestor: plethora_la\n",
      "plethora_la --(bor)--> plethora_en\n",
      "\n",
      "\n",
      "Ancestor: πληθώρη_grc\n",
      "πληθώρη_grc --(inh)--> plethora_la --(bor)--> plethora_en\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "find_etymological_paths(G, \"plethora_en\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "886604d8-772f-4cca-a99d-f4aae74337ce",
   "metadata": {},
   "source": [
    "### Debugging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 308,
   "id": "475e68ec-b910-4500-ab7a-bcd41f9bc70b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/3695218092.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
      "  contains_substring = df_link.applymap(lambda x: substring in str(x)).any(axis=1)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       relation_type   child  parent child_lexeme parent_lexeme\n",
      "354532           inh   93900   93897  plethora_la   πληθώρη_grc\n",
      "488737           bor   93895   93896  plethora_en   plethora_la\n",
      "510642           inh  334182   93896   plétora_es   plethora_la\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/3695218092.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
      "  contains_substring = df_link.applymap(lambda x: substring in str(x)).any(axis=1)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       relation_type   child  parent child_lexeme parent_lexeme\n",
      "34485            der   93897   93898  πληθώρη_grc     πλήθω_grc\n",
      "255550           bor  694256   93897  pléthore_fr   πληθώρη_grc\n",
      "354532           inh   93900   93897  plethora_la   πληθώρη_grc\n",
      "680060           inh  332445   93897   pletura_la   πληθώρη_grc\n"
     ]
    }
   ],
   "source": [
    "substrings = [\"plethora\", \"πληθώρη\"]\n",
    "# substrings = [\"myriad\"]\n",
    "for substring in substrings:\n",
    "    contains_substring = df_link.applymap(lambda x: substring in str(x)).any(axis=1)\n",
    "    \n",
    "    # Filter the DataFrame based on the mask\n",
    "    filtered_rows = df_link[contains_substring]\n",
    "    \n",
    "    # Display the filtered rows\n",
    "    print(filtered_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "id": "cd50bf89-ea36-4fc1-8e9c-882b0003b216",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "NetworkXError",
     "evalue": "The node copiousness_en is not in the digraph.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/classes/digraph.py:927\u001b[0m, in \u001b[0;36mDiGraph.predecessors\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m    926\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 927\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28miter\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_pred\u001b[49m\u001b[43m[\u001b[49m\u001b[43mn\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m    928\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "\u001b[0;31mKeyError\u001b[0m: 'copiousness_en'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mNetworkXError\u001b[0m                             Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[254], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfind_etymological_paths\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcopiousness_en\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[239], line 6\u001b[0m, in \u001b[0;36mfind_etymological_paths\u001b[0;34m(G, word)\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;124;03mRetrieves all paths from the given word to its ancestors, including the relationship types along the paths.\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m      5\u001b[0m all_paths \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ancestor \u001b[38;5;129;01min\u001b[39;00m \u001b[43mnx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mancestors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m      7\u001b[0m     \u001b[38;5;66;03m# Find all simple paths from each ancestor to the word\u001b[39;00m\n\u001b[1;32m      8\u001b[0m     paths \u001b[38;5;241m=\u001b[39m nx\u001b[38;5;241m.\u001b[39mall_simple_paths(G, source\u001b[38;5;241m=\u001b[39mancestor, target\u001b[38;5;241m=\u001b[39mword)\n\u001b[1;32m      9\u001b[0m     detailed_paths \u001b[38;5;241m=\u001b[39m []\n",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/utils/backends.py:412\u001b[0m, in \u001b[0;36m_dispatch.__call__\u001b[0;34m(self, backend, *args, **kwargs)\u001b[0m\n\u001b[1;32m    409\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m/\u001b[39m, \u001b[38;5;241m*\u001b[39margs, backend\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    410\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m backends:\n\u001b[1;32m    411\u001b[0m         \u001b[38;5;66;03m# Fast path if no backends are installed\u001b[39;00m\n\u001b[0;32m--> 412\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43morig_func\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    414\u001b[0m     \u001b[38;5;66;03m# Use `backend_name` in this function instead of `backend`\u001b[39;00m\n\u001b[1;32m    415\u001b[0m     backend_name \u001b[38;5;241m=\u001b[39m backend\n",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/dag.py:110\u001b[0m, in \u001b[0;36mancestors\u001b[0;34m(G, source)\u001b[0m\n\u001b[1;32m     76\u001b[0m \u001b[38;5;129m@nx\u001b[39m\u001b[38;5;241m.\u001b[39m_dispatch\n\u001b[1;32m     77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mancestors\u001b[39m(G, source):\n\u001b[1;32m     78\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all nodes having a path to `source` in `G`.\u001b[39;00m\n\u001b[1;32m     79\u001b[0m \n\u001b[1;32m     80\u001b[0m \u001b[38;5;124;03m    Parameters\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    108\u001b[0m \u001b[38;5;124;03m    descendants\u001b[39;00m\n\u001b[1;32m    109\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 110\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m{\u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mparent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mnx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbfs_edges\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m\n",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/dag.py:110\u001b[0m, in \u001b[0;36m<setcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m     76\u001b[0m \u001b[38;5;129m@nx\u001b[39m\u001b[38;5;241m.\u001b[39m_dispatch\n\u001b[1;32m     77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mancestors\u001b[39m(G, source):\n\u001b[1;32m     78\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all nodes having a path to `source` in `G`.\u001b[39;00m\n\u001b[1;32m     79\u001b[0m \n\u001b[1;32m     80\u001b[0m \u001b[38;5;124;03m    Parameters\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    108\u001b[0m \u001b[38;5;124;03m    descendants\u001b[39;00m\n\u001b[1;32m    109\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 110\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m{\u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mparent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mnx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbfs_edges\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m\n",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/traversal/breadth_first_search.py:203\u001b[0m, in \u001b[0;36mbfs_edges\u001b[0;34m(G, source, reverse, depth_limit, sort_neighbors)\u001b[0m\n\u001b[1;32m    199\u001b[0m     \u001b[38;5;28;01myield from\u001b[39;00m generic_bfs_edges(\n\u001b[1;32m    200\u001b[0m         G, source, \u001b[38;5;28;01mlambda\u001b[39;00m node: \u001b[38;5;28miter\u001b[39m(sort_neighbors(successors(node))), depth_limit\n\u001b[1;32m    201\u001b[0m     )\n\u001b[1;32m    202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 203\u001b[0m     \u001b[38;5;28;01myield from\u001b[39;00m generic_bfs_edges(G, source, successors, depth_limit)\n",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/traversal/breadth_first_search.py:103\u001b[0m, in \u001b[0;36mgeneric_bfs_edges\u001b[0;34m(G, source, neighbors, depth_limit, sort_neighbors)\u001b[0m\n\u001b[1;32m    101\u001b[0m n \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(G)\n\u001b[1;32m    102\u001b[0m depth \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 103\u001b[0m next_parents_children \u001b[38;5;241m=\u001b[39m [(source, \u001b[43mneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m)\u001b[49m)]\n\u001b[1;32m    104\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m next_parents_children \u001b[38;5;129;01mand\u001b[39;00m depth \u001b[38;5;241m<\u001b[39m depth_limit:\n\u001b[1;32m    105\u001b[0m     this_parents_children \u001b[38;5;241m=\u001b[39m next_parents_children\n",
      "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/classes/digraph.py:929\u001b[0m, in \u001b[0;36mDiGraph.predecessors\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m    927\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28miter\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pred[n])\n\u001b[1;32m    928\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 929\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m NetworkXError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe node \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not in the digraph.\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n",
      "\u001b[0;31mNetworkXError\u001b[0m: The node copiousness_en is not in the digraph."
     ]
    }
   ],
   "source": [
    "# nx.ancestors raises NetworkXError for a node that is not in the graph,\n",
    "# so check membership first instead of letting the cell fail.\n",
    "find_etymological_paths(G, \"copiousness_en\") if \"copiousness_en\" in G else \"copiousness_en is not in G\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "id": "df9c6586-b663-488d-98e8-e5565882a5d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation_type</th>\n",
       "      <th>child</th>\n",
       "      <th>parent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>der</td>\n",
       "      <td>terrado_pt</td>\n",
       "      <td>terra_pt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>inh</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>sḱeyd-_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>inh</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>skeyd-_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>inh</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>sḱeyd- ~ skeyt-_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>inh</td>\n",
       "      <td>skītaną_gem-pro</td>\n",
       "      <td>skeyd_ine-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724901</th>\n",
       "      <td>inh</td>\n",
       "      <td>kokea_fi</td>\n",
       "      <td>kokedak_fiu-fin-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724902</th>\n",
       "      <td>inh</td>\n",
       "      <td>aluna_pt</td>\n",
       "      <td>alumna_la</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724903</th>\n",
       "      <td>der(s)</td>\n",
       "      <td>litográfico_es</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724904</th>\n",
       "      <td>bor</td>\n",
       "      <td>cúlóm_ga</td>\n",
       "      <td>coulomb_en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724905</th>\n",
       "      <td>inh</td>\n",
       "      <td>soiscéal_ga</td>\n",
       "      <td>soiscél_sga</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>724906 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       relation_type            child                   parent\n",
       "0                der       terrado_pt                 terra_pt\n",
       "1                inh  skītaną_gem-pro           sḱeyd-_ine-pro\n",
       "2                inh  skītaną_gem-pro           skeyd-_ine-pro\n",
       "3                inh  skītaną_gem-pro  sḱeyd- ~ skeyt-_ine-pro\n",
       "4                inh  skītaną_gem-pro            skeyd_ine-pro\n",
       "...              ...              ...                      ...\n",
       "724901           inh         kokea_fi      kokedak_fiu-fin-pro\n",
       "724902           inh         aluna_pt                alumna_la\n",
       "724903        der(s)   litográfico_es                      NaN\n",
       "724904           bor         cúlóm_ga               coulomb_en\n",
       "724905           inh      soiscéal_ga              soiscél_sga\n",
       "\n",
       "[724906 rows x 3 columns]"
      ]
     },
     "execution_count": 249,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "id": "eda8df48-9acf-4f7e-b153-d40bf80150c9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        lang  field      lexeme    meaning classification_lang normalized_form\n",
      "id                                                                            \n",
      "1129441   en      0    exampler   exampler                  en        exampler\n",
      "1129443   en      0   examplers  examplers                  en       examplers\n",
      "1578981   la      0  examplexor        NaN                  la      examplexor\n",
      "16136     en      0     example    example                  en         example\n",
      "16138    enm      0     example        NaN                 enm         example\n",
      "1866476  fro      0     example        NaN                 fro         example\n",
      "315456    en      0    examples   examples                  en        examples\n",
      "949419    en      0    exampled   exampled                  en        exampled\n",
      "Empty DataFrame\n",
      "Columns: [relation_type, child, parent]\n",
      "Index: []\n"
     ]
    }
   ],
   "source": [
    "# Entries whose lexeme starts with 'example' (prefix match; missing lexemes never match)\n",
    "filtered_df = df_values[df_values['lexeme'].str.startswith('example', na=False)]\n",
    "\n",
    "# Display the filtered DataFrame\n",
    "print(filtered_df)\n",
    "\n",
    "# Link rows that reference any of those entry ids as child or parent.\n",
    "# Only the two id columns are compared, so the other columns of df_link\n",
    "# cannot contribute accidental matches.\n",
    "rows_with_ids = df_link[df_link[['child', 'parent']].isin(filtered_df.index).any(axis=1)]\n",
    "print(rows_with_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "id": "d6ebbcc9-45cd-4e5a-a58a-8a10ec1ca3f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation_type</th>\n",
       "      <th>child</th>\n",
       "      <th>parent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>251448</th>\n",
       "      <td>inh</td>\n",
       "      <td>cōp_odt</td>\n",
       "      <td>kaupaz_gem-pro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>483114</th>\n",
       "      <td>inh</td>\n",
       "      <td>cōpon_odt</td>\n",
       "      <td>kaupōną_gem-pro</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       relation_type      child           parent\n",
       "251448           inh    cōp_odt   kaupaz_gem-pro\n",
       "483114           inh  cōpon_odt  kaupōną_gem-pro"
      ]
     },
     "execution_count": 268,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Link rows whose child value starts with 'cōp' (spelling with macron); missing values never match.\n",
    "df_link.loc[df_link['child'].str.startswith('cōp', na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "id": "81e74a0d-bf3e-4129-8340-50b7b2b66602",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>meaning</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>396532</th>\n",
       "      <td>la</td>\n",
       "      <td>0</td>\n",
       "      <td>cōpiōsus</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>copiosus</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71240</th>\n",
       "      <td>la</td>\n",
       "      <td>0</td>\n",
       "      <td>cōpia</td>\n",
       "      <td>supply</td>\n",
       "      <td>la</td>\n",
       "      <td>copia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71247</th>\n",
       "      <td>la</td>\n",
       "      <td>0</td>\n",
       "      <td>cōpia</td>\n",
       "      <td>plenty, abundance</td>\n",
       "      <td>la</td>\n",
       "      <td>copia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>925907</th>\n",
       "      <td>la</td>\n",
       "      <td>0</td>\n",
       "      <td>cōpia</td>\n",
       "      <td>NaN</td>\n",
       "      <td>la</td>\n",
       "      <td>copia</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       lang  field    lexeme            meaning classification_lang  \\\n",
       "id                                                                    \n",
       "396532   la      0  cōpiōsus                NaN                  la   \n",
       "71240    la      0     cōpia             supply                  la   \n",
       "71247    la      0     cōpia  plenty, abundance                  la   \n",
       "925907   la      0     cōpia                NaN                  la   \n",
       "\n",
       "       normalized_form  \n",
       "id                      \n",
       "396532        copiosus  \n",
       "71240            copia  \n",
       "71247            copia  \n",
       "925907           copia  "
      ]
     },
     "execution_count": 271,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Entries whose lexeme starts with 'cōpi' (spelling with macron); missing lexemes never match.\n",
    "df_values.loc[df_values['lexeme'].str.startswith('cōpi', na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "id": "b2b53b0a-9547-4206-9a99-1ff02ccaaefe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>meaning</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1010388</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copings</td>\n",
       "      <td>copings</td>\n",
       "      <td>en</td>\n",
       "      <td>copings</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1123071</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copist</td>\n",
       "      <td>copist</td>\n",
       "      <td>en</td>\n",
       "      <td>copist</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1123072</th>\n",
       "      <td>fr</td>\n",
       "      <td>0</td>\n",
       "      <td>copiste</td>\n",
       "      <td>copist</td>\n",
       "      <td>fr</td>\n",
       "      <td>copiste</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1123073</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copists</td>\n",
       "      <td>copists</td>\n",
       "      <td>en</td>\n",
       "      <td>copists</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1127504</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copicide</td>\n",
       "      <td>copicide</td>\n",
       "      <td>en</td>\n",
       "      <td>copicide</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92270</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copious</td>\n",
       "      <td>copious</td>\n",
       "      <td>en</td>\n",
       "      <td>copious</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92271</th>\n",
       "      <td>la</td>\n",
       "      <td>0</td>\n",
       "      <td>copiosus ~ copia</td>\n",
       "      <td>abundance</td>\n",
       "      <td>la</td>\n",
       "      <td>copiosus ~ copia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>956757</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copiotroph</td>\n",
       "      <td>copiotroph</td>\n",
       "      <td>en</td>\n",
       "      <td>copiotroph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>974941</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copiloting</td>\n",
       "      <td>copiloting</td>\n",
       "      <td>en</td>\n",
       "      <td>copiloting</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>974942</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copiloted</td>\n",
       "      <td>copiloted</td>\n",
       "      <td>en</td>\n",
       "      <td>copiloted</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>89 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        lang  field            lexeme     meaning classification_lang  \\\n",
       "id                                                                      \n",
       "1010388   en      0           copings     copings                  en   \n",
       "1123071   en      0            copist      copist                  en   \n",
       "1123072   fr      0           copiste      copist                  fr   \n",
       "1123073   en      0           copists     copists                  en   \n",
       "1127504   en      0          copicide    copicide                  en   \n",
       "...      ...    ...               ...         ...                 ...   \n",
       "92270     en      0           copious     copious                  en   \n",
       "92271     la      0  copiosus ~ copia   abundance                  la   \n",
       "956757    en      0        copiotroph  copiotroph                  en   \n",
       "974941    en      0        copiloting  copiloting                  en   \n",
       "974942    en      0         copiloted   copiloted                  en   \n",
       "\n",
       "          normalized_form  \n",
       "id                         \n",
       "1010388           copings  \n",
       "1123071            copist  \n",
       "1123072           copiste  \n",
       "1123073           copists  \n",
       "1127504          copicide  \n",
       "...                   ...  \n",
       "92270             copious  \n",
       "92271    copiosus ~ copia  \n",
       "956757         copiotroph  \n",
       "974941         copiloting  \n",
       "974942          copiloted  \n",
       "\n",
       "[89 rows x 6 columns]"
      ]
     },
     "execution_count": 285,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Entries whose lexeme starts with the unaccented prefix 'copi'; missing lexemes never match.\n",
    "df_values.loc[df_values['lexeme'].str.startswith('copi', na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 313,
   "id": "5440152f-31c6-44d9-b0a0-3ef6f6cd7ab0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _rows_with_prefix(df, prefix):\n",
    "    \"\"\"Return the rows of `df` in which at least one cell starts with `prefix`.\n",
    "\n",
    "    Every cell is passed through str() before the test, so numeric ids are\n",
    "    compared by their decimal text and missing values by the text 'nan'.\n",
    "    \"\"\"\n",
    "    # Column-wise Series.map produces the same element-wise result as\n",
    "    # DataFrame.applymap, which pandas has deprecated (FutureWarning).\n",
    "    starts_with_prefix = df.apply(\n",
    "        lambda column: column.map(lambda cell: str(cell).startswith(prefix))\n",
    "    )\n",
    "    return df[starts_with_prefix.any(axis=1)]\n",
    "\n",
    "\n",
    "def find_substring_in_values(substring):\n",
    "    \"\"\"Return the rows of the global `df_values` where any column starts with `substring`.\n",
    "\n",
    "    Despite the name this is a prefix match, not a substring match, and it\n",
    "    inspects every column (for example `meaning` as well as `lexeme`).\n",
    "    \"\"\"\n",
    "    return _rows_with_prefix(df_values, substring)\n",
    "\n",
    "\n",
    "def find_substring_in_links(substring):\n",
    "    \"\"\"Return the rows of the global `df_link` where any column starts with `substring`.\n",
    "\n",
    "    Despite the name this is a prefix match, not a substring match, and it\n",
    "    inspects every column of the link table.\n",
    "    \"\"\"\n",
    "    return _rows_with_prefix(df_link, substring)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 314,
   "id": "c7b63d39-4d97-4efb-8a5a-dca9abd586af",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2784440865.py:11: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
      "  contains_substring = df_link.applymap(lambda x: str(x).startswith(substring)).any(axis=1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation_type</th>\n",
       "      <th>child</th>\n",
       "      <th>parent</th>\n",
       "      <th>child_lexeme</th>\n",
       "      <th>parent_lexeme</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>51644</th>\n",
       "      <td>der</td>\n",
       "      <td>553565</td>\n",
       "      <td>399582</td>\n",
       "      <td>copione_it</td>\n",
       "      <td>copiare_it</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109777</th>\n",
       "      <td>inh</td>\n",
       "      <td>396531</td>\n",
       "      <td>396532</td>\n",
       "      <td>copioso_it</td>\n",
       "      <td>copiosus_la</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>235078</th>\n",
       "      <td>inh</td>\n",
       "      <td>396533</td>\n",
       "      <td>396532</td>\n",
       "      <td>copioso_pt</td>\n",
       "      <td>copiosus_la</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273367</th>\n",
       "      <td>inh</td>\n",
       "      <td>396534</td>\n",
       "      <td>396532</td>\n",
       "      <td>copioso_es</td>\n",
       "      <td>copiosus_la</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>500184</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>92271</td>\n",
       "      <td>-1276</td>\n",
       "      <td>copiosus ~ copia_la</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>624282</th>\n",
       "      <td>bor</td>\n",
       "      <td>847746</td>\n",
       "      <td>396532</td>\n",
       "      <td>copieux_fr</td>\n",
       "      <td>copiosus_la</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       relation_type   child  parent         child_lexeme parent_lexeme\n",
       "51644            der  553565  399582           copione_it    copiare_it\n",
       "109777           inh  396531  396532           copioso_it   copiosus_la\n",
       "235078           inh  396533  396532           copioso_pt   copiosus_la\n",
       "273367           inh  396534  396532           copioso_es   copiosus_la\n",
       "500184      cmpd+bor   92271   -1276  copiosus ~ copia_la           NaN\n",
       "624282           bor  847746  396532           copieux_fr   copiosus_la"
      ]
     },
     "execution_count": 314,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_substring_in_links('copio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "id": "571dc06d-3fe4-4866-bbeb-2fb7d43fd113",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2784440865.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
      "  contains_substring = df_values.applymap(lambda x: str(x).startswith(substring)).any(axis=1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>meaning</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1089933</th>\n",
       "      <td>sk</td>\n",
       "      <td>0</td>\n",
       "      <td>sporý</td>\n",
       "      <td>copious</td>\n",
       "      <td>sk</td>\n",
       "      <td>sporý</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1489361</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copiousnesses</td>\n",
       "      <td>copiousnesses</td>\n",
       "      <td>en</td>\n",
       "      <td>copiousnesses</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320737</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copiously</td>\n",
       "      <td>copiously</td>\n",
       "      <td>en</td>\n",
       "      <td>copiously</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>456061</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copiousness</td>\n",
       "      <td>copiousness</td>\n",
       "      <td>en</td>\n",
       "      <td>copiousness</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92270</th>\n",
       "      <td>en</td>\n",
       "      <td>0</td>\n",
       "      <td>copious</td>\n",
       "      <td>copious</td>\n",
       "      <td>en</td>\n",
       "      <td>copious</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        lang  field         lexeme        meaning classification_lang  \\\n",
       "id                                                                      \n",
       "1089933   sk      0          sporý        copious                  sk   \n",
       "1489361   en      0  copiousnesses  copiousnesses                  en   \n",
       "320737    en      0      copiously      copiously                  en   \n",
       "456061    en      0    copiousness    copiousness                  en   \n",
       "92270     en      0        copious        copious                  en   \n",
       "\n",
       "        normalized_form  \n",
       "id                       \n",
       "1089933           sporý  \n",
       "1489361   copiousnesses  \n",
       "320737        copiously  \n",
       "456061      copiousness  \n",
       "92270           copious  "
      ]
     },
     "execution_count": 315,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_substring_in_values('copious')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "id": "82fb127c-7787-4e6f-8817-890a3d9211d2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation_type</th>\n",
       "      <th>child</th>\n",
       "      <th>parent</th>\n",
       "      <th>child_lexeme</th>\n",
       "      <th>parent_lexeme</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>der(s)</td>\n",
       "      <td>1165275</td>\n",
       "      <td>-40470</td>\n",
       "      <td>offensivamente_it</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>275331</td>\n",
       "      <td>-10538</td>\n",
       "      <td>afuera_es</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>1606111</td>\n",
       "      <td>-62916</td>\n",
       "      <td>transisjonsmetall_nn</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>76497</td>\n",
       "      <td>-1026</td>\n",
       "      <td>پنجابی_fa</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>der(s)</td>\n",
       "      <td>1644793</td>\n",
       "      <td>-65270</td>\n",
       "      <td>memoriousness_en</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724807</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>1252646</td>\n",
       "      <td>-47339</td>\n",
       "      <td>mycha_pl</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724868</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>410216</td>\n",
       "      <td>-8062</td>\n",
       "      <td>refinar_pt</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724877</th>\n",
       "      <td>der(s)</td>\n",
       "      <td>1053115</td>\n",
       "      <td>-33809</td>\n",
       "      <td>wordwise_en</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724894</th>\n",
       "      <td>cmpd+bor</td>\n",
       "      <td>61790</td>\n",
       "      <td>-766</td>\n",
       "      <td>laurer_fr</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724903</th>\n",
       "      <td>der(s)</td>\n",
       "      <td>1120131</td>\n",
       "      <td>-37200</td>\n",
       "      <td>litográfico_es</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>81102 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       relation_type    child  parent          child_lexeme parent_lexeme\n",
       "15            der(s)  1165275  -40470     offensivamente_it           NaN\n",
       "22          cmpd+bor   275331  -10538             afuera_es           NaN\n",
       "30          cmpd+bor  1606111  -62916  transisjonsmetall_nn           NaN\n",
       "31          cmpd+bor    76497   -1026             پنجابی_fa           NaN\n",
       "48            der(s)  1644793  -65270      memoriousness_en           NaN\n",
       "...              ...      ...     ...                   ...           ...\n",
       "724807      cmpd+bor  1252646  -47339              mycha_pl           NaN\n",
       "724868      cmpd+bor   410216   -8062            refinar_pt           NaN\n",
       "724877        der(s)  1053115  -33809           wordwise_en           NaN\n",
       "724894      cmpd+bor    61790    -766             laurer_fr           NaN\n",
       "724903        der(s)  1120131  -37200        litográfico_es           NaN\n",
       "\n",
       "[81102 rows x 5 columns]"
      ]
     },
     "execution_count": 311,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show every link row that has at least one missing value in any column.\n",
    "df_link.loc[df_link.isna().any(axis='columns')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 320,
   "id": "09c17a8c-997e-4fa7-9bcb-7ebabb88a89e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>meaning</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1015334</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>描</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>描</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102467</th>\n",
       "      <td>de</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>zero</td>\n",
       "      <td>de</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1026310</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>曼谷</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>曼谷</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1064258</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>白菜__HASH__Chinese</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>白菜__HASH__Chinese</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1110324</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>khà-sàng</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>khà-sàng</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>947497</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>馬車</td>\n",
       "      <td>becak</td>\n",
       "      <td>NaN</td>\n",
       "      <td>馬車</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>979068</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>白話字 ~ 白话字</td>\n",
       "      <td>vernacular writing</td>\n",
       "      <td>NaN</td>\n",
       "      <td>白話字 ~ 白话字</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>991521</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>復仇</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>復仇</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1397622</th>\n",
       "      <td>aql-pro</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>aql-pro</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>907358</th>\n",
       "      <td>trk-pro</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>trk-pro</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>221 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            lang  field             lexeme             meaning  \\\n",
       "id                                                               \n",
       "1015334      NaN      0                  描                 NaN   \n",
       "102467        de      0                NaN                zero   \n",
       "1026310      NaN      0                 曼谷                 NaN   \n",
       "1064258      NaN      0  白菜__HASH__Chinese                 NaN   \n",
       "1110324      NaN      0           khà-sàng                 NaN   \n",
       "...          ...    ...                ...                 ...   \n",
       "947497       NaN      0                 馬車               becak   \n",
       "979068       NaN      0          白話字 ~ 白话字  vernacular writing   \n",
       "991521       NaN      0                 復仇                 NaN   \n",
       "1397622  aql-pro      0                NaN                 NaN   \n",
       "907358   trk-pro      0                NaN                 NaN   \n",
       "\n",
       "        classification_lang    normalized_form  \n",
       "id                                              \n",
       "1015334                 NaN                  描  \n",
       "102467                   de                NaN  \n",
       "1026310                 NaN                 曼谷  \n",
       "1064258                 NaN  白菜__HASH__Chinese  \n",
       "1110324                 NaN           khà-sàng  \n",
       "...                     ...                ...  \n",
       "947497                  NaN                 馬車  \n",
       "979068                  NaN          白話字 ~ 白话字  \n",
       "991521                  NaN                 復仇  \n",
       "1397622             aql-pro                NaN  \n",
       "907358              trk-pro                NaN  \n",
       "\n",
       "[221 rows x 6 columns]"
      ]
     },
     "execution_count": 320,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show every lexeme row that has at least one missing value in any column.\n",
    "df_values.loc[df_values.isna().any(axis='columns')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "id": "f2e02f80-4f8b-45a6-aab5-6575e7c2d61d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lang</th>\n",
       "      <th>field</th>\n",
       "      <th>lexeme</th>\n",
       "      <th>classification_lang</th>\n",
       "      <th>normalized_form</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1015334</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>描</td>\n",
       "      <td>NaN</td>\n",
       "      <td>描</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102467</th>\n",
       "      <td>de</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>de</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1026310</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>曼谷</td>\n",
       "      <td>NaN</td>\n",
       "      <td>曼谷</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1064258</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>白菜__HASH__Chinese</td>\n",
       "      <td>NaN</td>\n",
       "      <td>白菜__HASH__Chinese</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1110324</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>khà-sàng</td>\n",
       "      <td>NaN</td>\n",
       "      <td>khà-sàng</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>947497</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>馬車</td>\n",
       "      <td>NaN</td>\n",
       "      <td>馬車</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>979068</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>白話字 ~ 白话字</td>\n",
       "      <td>NaN</td>\n",
       "      <td>白話字 ~ 白话字</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>991521</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>復仇</td>\n",
       "      <td>NaN</td>\n",
       "      <td>復仇</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1397622</th>\n",
       "      <td>aql-pro</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>aql-pro</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>907358</th>\n",
       "      <td>trk-pro</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>trk-pro</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>221 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            lang  field             lexeme classification_lang  \\\n",
       "id                                                               \n",
       "1015334      NaN      0                  描                 NaN   \n",
       "102467        de      0                NaN                  de   \n",
       "1026310      NaN      0                 曼谷                 NaN   \n",
       "1064258      NaN      0  白菜__HASH__Chinese                 NaN   \n",
       "1110324      NaN      0           khà-sàng                 NaN   \n",
       "...          ...    ...                ...                 ...   \n",
       "947497       NaN      0                 馬車                 NaN   \n",
       "979068       NaN      0          白話字 ~ 白话字                 NaN   \n",
       "991521       NaN      0                 復仇                 NaN   \n",
       "1397622  aql-pro      0                NaN             aql-pro   \n",
       "907358   trk-pro      0                NaN             trk-pro   \n",
       "\n",
       "           normalized_form  \n",
       "id                          \n",
       "1015334                  描  \n",
       "102467                 NaN  \n",
       "1026310                 曼谷  \n",
       "1064258  白菜__HASH__Chinese  \n",
       "1110324           khà-sàng  \n",
       "...                    ...  \n",
       "947497                  馬車  \n",
       "979068           白話字 ~ 白话字  \n",
       "991521                  復仇  \n",
       "1397622                NaN  \n",
       "907358                 NaN  \n",
       "\n",
       "[221 rows x 5 columns]"
      ]
     },
     "execution_count": 325,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows missing a value in any column other than 'meaning'.\n",
    "# Drop the column once, then filter with a callable indexer so the large frame is not copied twice.\n",
    "df_values.drop(columns='meaning').loc[lambda frame: frame.isna().any(axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ab9c1dd-829b-4697-92b4-24a26b9479c3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}