{
"cells": [
{
"cell_type": "code",
"execution_count": 203,
"id": "dc282d33-93c3-4c19-a49a-60feb16ee92a",
"metadata": {},
"outputs": [],
"source": [
"# To read the .csv files\n",
"import pandas as pd \n",
"# To store the direct inheritance relations as a graph\n",
"import networkx as nx\n",
"import matplotlib.pyplot as plt\n",
"from code_to_langs import wiki_code_to_lang\n",
"import unicodedata"
]
},
{
"cell_type": "code",
"execution_count": 301,
"id": "ad6271c3-1521-4fb4-b212-865d07e8e03b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2603880363.py:4: ParserWarning: Skipping line 745166: expected 5 fields, saw 6\n",
"\n",
" df_values = pd.read_csv(path_values,\n",
"/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2603880363.py:4: ParserWarning: Skipping line 998371: expected 5 fields, saw 6\n",
"\n",
" df_values = pd.read_csv(path_values,\n"
]
}
],
"source": [
"path_values = \"../data/split_etymdb/etymdb_values.csv\"\n",
"path_link = \"../data/split_etymdb/etymdb_links_info.csv\"\n",
"\n",
"df_values = pd.read_csv(path_values,\n",
" sep='\\t',\n",
" names=[\"id\", \"lang\", \"field\", \"lexeme\", \"meaning\"],\n",
" dtype={\"id\": int, \"lang\": str, \"field\": int, \"meaning\": str},\n",
" on_bad_lines='warn').set_index(\"id\")\n",
"\n",
"df_link = pd.read_csv(path_link,\n",
" sep='\\t',\n",
" names=[\"relation_type\", \"child\", \"parent\"],\n",
" dtype={\"relation_type\": str, \"child\": int, \"parent\": int})\n",
"\n",
"df_inher = df_link.loc[df_link['relation_type'].isin([\"inh\"])]\n",
"df_bor = df_link.loc[df_link['relation_type'].isin([\"bor\"])]\n",
"df_cog = df_link.loc[df_link['relation_type'].isin([\"cog\"])]"
]
},
{
"cell_type": "markdown",
"id": "cb8adc5f-d73b-4a36-a38a-23aa5dd08e35",
"metadata": {},
"source": [
"### Data cleaning"
]
},
{
"cell_type": "code",
"execution_count": 302,
"id": "17d9a12d-d58f-4df2-8be7-c1353f1a3383",
"metadata": {},
"outputs": [],
"source": [
"lang_map = {\n",
" 'itc-ola': 'la', 'la': 'la', 'la-ecc': 'la', 'la-lat': 'la',\n",
" 'la-med': 'la', 'la-new': 'la', 'la-ren': 'la', 'la-vul': 'la'\n",
"}\n",
"\n",
"# Apply the mapping and fill other values with their original\n",
"df_values['classification_lang'] = df_values['lang'].map(lang_map).fillna(df_values['lang'])\n",
"\n",
"def remove_accents(input_str):\n",
" if not isinstance(input_str, str):\n",
" return input_str\n",
" nfkd_form = unicodedata.normalize('NFKD', input_str)\n",
" # Keep only base characters, remove diacritics\n",
" return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])\n",
"\n",
"\n",
"# Apply the function only to lexemes where 'lang' is 'la'\n",
"df_values['normalized_form'] = df_values.apply(\n",
" lambda row: remove_accents(row['lexeme']) if row['classification_lang'] == 'la' else row['lexeme'],\n",
" axis=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 303,
"id": "162864e7-ef62-4ecb-9264-c5fafbcd7522",
"metadata": {},
"outputs": [],
"source": [
"map_series = df_values['normalized_form'] + '_' + df_values['classification_lang']\n",
"\n",
"# Replace IDs in df_link using the map\n",
"df_link['child_lexeme'] = df_link['child'].map(map_series)\n",
"df_link['parent_lexeme'] = df_link['parent'].map(map_series)"
]
},
{
"cell_type": "code",
"execution_count": 304,
"id": "a72a3af4-51c9-4d5a-bf50-f4c21a144650",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>meaning</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1005511</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>curcuma</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>curcuma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1006130</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>aphis</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>aphis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1014623</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>Opiliones</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>Opiliones</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1023473</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>punctātrīx</td>\n",
" <td>marker by dots</td>\n",
" <td>la</td>\n",
" <td>punctatrix</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1026932</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>Gorgonia</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>Gorgonia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>975058</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>molybdaena</td>\n",
" <td>any of various substances resembling lead, inc...</td>\n",
" <td>la</td>\n",
" <td>molybdaena</td>\n",
" </tr>\n",
" <tr>\n",
" <th>979913</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>dactylo-</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>dactylo-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>983202</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>mōlecula</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>molecula</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98511</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>dys-</td>\n",
" <td>dysphemism</td>\n",
" <td>la</td>\n",
" <td>dys-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>993666</th>\n",
" <td>la-new</td>\n",
" <td>0</td>\n",
" <td>butaurus</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>butaurus</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>688 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" lang field lexeme \\\n",
"id \n",
"1005511 la-new 0 curcuma \n",
"1006130 la-new 0 aphis \n",
"1014623 la-new 0 Opiliones \n",
"1023473 la-new 0 punctātrīx \n",
"1026932 la-new 0 Gorgonia \n",
"... ... ... ... \n",
"975058 la-new 0 molybdaena \n",
"979913 la-new 0 dactylo- \n",
"983202 la-new 0 mōlecula \n",
"98511 la-new 0 dys- \n",
"993666 la-new 0 butaurus \n",
"\n",
" meaning \\\n",
"id \n",
"1005511 NaN \n",
"1006130 NaN \n",
"1014623 NaN \n",
"1023473 marker by dots \n",
"1026932 NaN \n",
"... ... \n",
"975058 any of various substances resembling lead, inc... \n",
"979913 NaN \n",
"983202 NaN \n",
"98511 dysphemism \n",
"993666 NaN \n",
"\n",
" classification_lang normalized_form \n",
"id \n",
"1005511 la curcuma \n",
"1006130 la aphis \n",
"1014623 la Opiliones \n",
"1023473 la punctatrix \n",
"1026932 la Gorgonia \n",
"... ... ... \n",
"975058 la molybdaena \n",
"979913 la dactylo- \n",
"983202 la molecula \n",
"98511 la dys- \n",
"993666 la butaurus \n",
"\n",
"[688 rows x 6 columns]"
]
},
"execution_count": 304,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_values[df_values.eq('la-new').any(axis=1)]"
]
},
{
"cell_type": "code",
"execution_count": 305,
"id": "bb4fde5d-80fe-4fb8-a0d9-754747ed5c19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>meaning</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>dictionary</td>\n",
" <td>dictionary</td>\n",
" <td>en</td>\n",
" <td>dictionary</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>enm</td>\n",
" <td>0</td>\n",
" <td>free</td>\n",
" <td>free</td>\n",
" <td>enm</td>\n",
" <td>free</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>ru</td>\n",
" <td>0</td>\n",
" <td>kot</td>\n",
" <td>NaN</td>\n",
" <td>ru</td>\n",
" <td>kot</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000</th>\n",
" <td>fy</td>\n",
" <td>0</td>\n",
" <td>diele</td>\n",
" <td>to divide, to separate</td>\n",
" <td>fy</td>\n",
" <td>diele</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000</th>\n",
" <td>la</td>\n",
" <td>0</td>\n",
" <td>proprius</td>\n",
" <td>ownership</td>\n",
" <td>la</td>\n",
" <td>proprius</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>999998</th>\n",
" <td>gmq-osw</td>\n",
" <td>0</td>\n",
" <td>þrir</td>\n",
" <td>NaN</td>\n",
" <td>gmq-osw</td>\n",
" <td>þrir</td>\n",
" </tr>\n",
" <tr>\n",
" <th>999999</th>\n",
" <td>gmq-oda</td>\n",
" <td>0</td>\n",
" <td>fæm</td>\n",
" <td>NaN</td>\n",
" <td>gmq-oda</td>\n",
" <td>fæm</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1397622</th>\n",
" <td>aql-pro</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>aql-pro</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1885631</th>\n",
" <td>sa</td>\n",
" <td>0</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>sa</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>907358</th>\n",
" <td>trk-pro</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>trk-pro</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1885104 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" lang field lexeme meaning \\\n",
"id \n",
"1 en 0 dictionary dictionary \n",
"10 enm 0 free free \n",
"100 ru 0 kot NaN \n",
"1000 fy 0 diele to divide, to separate \n",
"10000 la 0 proprius ownership \n",
"... ... ... ... ... \n",
"999998 gmq-osw 0 þrir NaN \n",
"999999 gmq-oda 0 fæm NaN \n",
"1397622 aql-pro 0 NaN NaN \n",
"1885631 sa 0 NaN \n",
"907358 trk-pro 0 NaN NaN \n",
"\n",
" classification_lang normalized_form \n",
"id \n",
"1 en dictionary \n",
"10 enm free \n",
"100 ru kot \n",
"1000 fy diele \n",
"10000 la proprius \n",
"... ... ... \n",
"999998 gmq-osw þrir \n",
"999999 gmq-oda fæm \n",
"1397622 aql-pro NaN \n",
"1885631 sa \n",
"907358 trk-pro NaN \n",
"\n",
"[1885104 rows x 6 columns]"
]
},
"execution_count": 305,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_values"
]
},
{
"cell_type": "code",
"execution_count": 306,
"id": "e59fa0ca-1cbb-43e1-9327-c043ff72d4a4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation_type</th>\n",
" <th>child</th>\n",
" <th>parent</th>\n",
" <th>child_lexeme</th>\n",
" <th>parent_lexeme</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>der</td>\n",
" <td>1430903</td>\n",
" <td>93275</td>\n",
" <td>terrado_pt</td>\n",
" <td>terra_pt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>inh</td>\n",
" <td>124835</td>\n",
" <td>239436</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>sḱeyd-_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>inh</td>\n",
" <td>124835</td>\n",
" <td>239435</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>skeyd-_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>inh</td>\n",
" <td>124835</td>\n",
" <td>124836</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>sḱeyd- ~ skeyt-_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>inh</td>\n",
" <td>124835</td>\n",
" <td>1217413</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>skeyd_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724901</th>\n",
" <td>inh</td>\n",
" <td>251147</td>\n",
" <td>354703</td>\n",
" <td>kokea_fi</td>\n",
" <td>kokedak_fiu-fin-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724902</th>\n",
" <td>inh</td>\n",
" <td>665740</td>\n",
" <td>93416</td>\n",
" <td>aluna_pt</td>\n",
" <td>alumna_la</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724903</th>\n",
" <td>der(s)</td>\n",
" <td>1120131</td>\n",
" <td>-37200</td>\n",
" <td>litográfico_es</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724904</th>\n",
" <td>bor</td>\n",
" <td>1163595</td>\n",
" <td>68631</td>\n",
" <td>cúlóm_ga</td>\n",
" <td>coulomb_en</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724905</th>\n",
" <td>inh</td>\n",
" <td>1084576</td>\n",
" <td>927180</td>\n",
" <td>soiscéal_ga</td>\n",
" <td>soiscél_sga</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>724906 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" relation_type child parent child_lexeme \\\n",
"0 der 1430903 93275 terrado_pt \n",
"1 inh 124835 239436 skītaną_gem-pro \n",
"2 inh 124835 239435 skītaną_gem-pro \n",
"3 inh 124835 124836 skītaną_gem-pro \n",
"4 inh 124835 1217413 skītaną_gem-pro \n",
"... ... ... ... ... \n",
"724901 inh 251147 354703 kokea_fi \n",
"724902 inh 665740 93416 aluna_pt \n",
"724903 der(s) 1120131 -37200 litográfico_es \n",
"724904 bor 1163595 68631 cúlóm_ga \n",
"724905 inh 1084576 927180 soiscéal_ga \n",
"\n",
" parent_lexeme \n",
"0 terra_pt \n",
"1 sḱeyd-_ine-pro \n",
"2 skeyd-_ine-pro \n",
"3 sḱeyd- ~ skeyt-_ine-pro \n",
"4 skeyd_ine-pro \n",
"... ... \n",
"724901 kokedak_fiu-fin-pro \n",
"724902 alumna_la \n",
"724903 NaN \n",
"724904 coulomb_en \n",
"724905 soiscél_sga \n",
"\n",
"[724906 rows x 5 columns]"
]
},
"execution_count": 306,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_link"
]
},
{
"cell_type": "code",
"execution_count": 211,
"id": "b09984fc-27f0-4afe-879c-631690c2ba84",
"metadata": {},
"outputs": [],
"source": [
"# Create a directed graph\n",
"G = nx.DiGraph()\n",
"\n",
"# Add edges to the graph\n",
"for idx, row in df_link[df_link['relation_type'] != 'cog'].iterrows():\n",
" if pd.notna(row['parent']): # Only add edges where the parent exists\n",
" G.add_edge(row['parent'], row['child'], relation_type=row['relation_type'])"
]
},
{
"cell_type": "code",
"execution_count": 239,
"id": "861b5e80-4303-4202-9dd3-a65585fd132c",
"metadata": {},
"outputs": [],
"source": [
"def find_etymological_paths(G, word):\n",
" \"\"\"\n",
" Retrieves all paths from the given word to its ancestors, including the relationship types along the paths.\n",
" \"\"\"\n",
" all_paths = {}\n",
" for ancestor in nx.ancestors(G, word):\n",
" # Find all simple paths from each ancestor to the word\n",
" paths = nx.all_simple_paths(G, source=ancestor, target=word)\n",
" detailed_paths = []\n",
" for path in paths:\n",
" # Get relationship types along the path\n",
" path_details = []\n",
" for i in range(len(path) - 1):\n",
" parent, child = path[i], path[i + 1]\n",
" relation_type = G.edges[parent, child]['relation_type']\n",
" path_details.append((parent, child, relation_type))\n",
" detailed_paths.append(path_details)\n",
" all_paths[ancestor] = detailed_paths \n",
" print_paths(all_paths)\n",
" # return all_paths\n",
"\n",
"\n",
"\n",
"def print_paths(paths):\n",
" \"\"\"\n",
" Prints each path with relationship details.\n",
" \"\"\"\n",
" for ancestor, paths_details in paths.items():\n",
" print(f\"Ancestor: {ancestor}\")\n",
" for path in paths_details:\n",
" # Create a readable path string, ensuring no child is double-printed\n",
" path_str = ''\n",
" for i, (parent, child, rel_type) in enumerate(path):\n",
" if i == 0:\n",
" path_str += f\"{parent} --({rel_type})--> {child}\"\n",
" else:\n",
" path_str += f\" --({rel_type})--> {child}\"\n",
" print(path_str)\n",
" print(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 240,
"id": "7679f966-7c3a-4ac0-a648-aa927df4b9c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ancestor: μύριος_grc\n",
"μύριος_grc --(der)--> μυριάς_grc --(inh)--> myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
"\n",
"\n",
"Ancestor: μυρίος_grc\n",
"μυρίος_grc --(der)--> μυριάς_grc --(inh)--> myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
"\n",
"\n",
"Ancestor: myriade_fr\n",
"myriade_fr --(inh)--> myriad_en\n",
"\n",
"\n",
"Ancestor: μυριάς_grc\n",
"μυριάς_grc --(inh)--> myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
"\n",
"\n",
"Ancestor: myrias_la\n",
"myrias_la --(bor)--> myriade_fr --(inh)--> myriad_en\n",
"\n",
"\n"
]
}
],
"source": [
"find_etymological_paths(G, 'myriad_en')"
]
},
{
"cell_type": "code",
"execution_count": 230,
"id": "9285a171-d324-4478-9d00-172b1f9d5bae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ancestor: πλήθω_grc\n",
"πλήθω_grc --(der)--> πληθώρη_grc --(inh)--> plethora_la --(bor)--> plethora_en\n",
"\n",
"\n",
"Ancestor: pleh₁-_ine-pro\n",
"pleh₁-_ine-pro --(inh)--> πλήθω_grc --(der)--> πληθώρη_grc --(inh)--> plethora_la --(bor)--> plethora_en\n",
"\n",
"\n",
"Ancestor: plethora_la\n",
"plethora_la --(bor)--> plethora_en\n",
"\n",
"\n",
"Ancestor: πληθώρη_grc\n",
"πληθώρη_grc --(inh)--> plethora_la --(bor)--> plethora_en\n",
"\n",
"\n"
]
}
],
"source": [
"find_etymological_paths(G, \"plethora_en\")"
]
},
{
"cell_type": "markdown",
"id": "886604d8-772f-4cca-a99d-f4aae74337ce",
"metadata": {},
"source": [
"### Debugging"
]
},
{
"cell_type": "code",
"execution_count": 308,
"id": "475e68ec-b910-4500-ab7a-bcd41f9bc70b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/3695218092.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
" contains_substring = df_link.applymap(lambda x: substring in str(x)).any(axis=1)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" relation_type child parent child_lexeme parent_lexeme\n",
"354532 inh 93900 93897 plethora_la πληθώρη_grc\n",
"488737 bor 93895 93896 plethora_en plethora_la\n",
"510642 inh 334182 93896 plétora_es plethora_la\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/3695218092.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
" contains_substring = df_link.applymap(lambda x: substring in str(x)).any(axis=1)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" relation_type child parent child_lexeme parent_lexeme\n",
"34485 der 93897 93898 πληθώρη_grc πλήθω_grc\n",
"255550 bor 694256 93897 pléthore_fr πληθώρη_grc\n",
"354532 inh 93900 93897 plethora_la πληθώρη_grc\n",
"680060 inh 332445 93897 pletura_la πληθώρη_grc\n"
]
}
],
"source": [
"substrings = [\"plethora\", \"πληθώρη\"]\n",
"# substrings = [\"myriad\"]\n",
"for substring in substrings:\n",
" contains_substring = df_link.applymap(lambda x: substring in str(x)).any(axis=1)\n",
" \n",
" # Filter the DataFrame based on the mask\n",
" filtered_rows = df_link[contains_substring]\n",
" \n",
" # Display the filtered rows\n",
" print(filtered_rows)"
]
},
{
"cell_type": "code",
"execution_count": 254,
"id": "cd50bf89-ea36-4fc1-8e9c-882b0003b216",
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "NetworkXError",
"evalue": "The node copiousness_en is not in the digraph.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/classes/digraph.py:927\u001b[0m, in \u001b[0;36mDiGraph.predecessors\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 926\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 927\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28miter\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_pred\u001b[49m\u001b[43m[\u001b[49m\u001b[43mn\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 928\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
"\u001b[0;31mKeyError\u001b[0m: 'copiousness_en'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mNetworkXError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[254], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfind_etymological_paths\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcopiousness_en\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[239], line 6\u001b[0m, in \u001b[0;36mfind_etymological_paths\u001b[0;34m(G, word)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124;03mRetrieves all paths from the given word to its ancestors, including the relationship types along the paths.\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 5\u001b[0m all_paths \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ancestor \u001b[38;5;129;01min\u001b[39;00m \u001b[43mnx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mancestors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Find all simple paths from each ancestor to the word\u001b[39;00m\n\u001b[1;32m 8\u001b[0m paths \u001b[38;5;241m=\u001b[39m nx\u001b[38;5;241m.\u001b[39mall_simple_paths(G, source\u001b[38;5;241m=\u001b[39mancestor, target\u001b[38;5;241m=\u001b[39mword)\n\u001b[1;32m 9\u001b[0m detailed_paths \u001b[38;5;241m=\u001b[39m []\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/utils/backends.py:412\u001b[0m, in \u001b[0;36m_dispatch.__call__\u001b[0;34m(self, backend, *args, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m/\u001b[39m, \u001b[38;5;241m*\u001b[39margs, backend\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m backends:\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# Fast path if no backends are installed\u001b[39;00m\n\u001b[0;32m--> 412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43morig_func\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 414\u001b[0m \u001b[38;5;66;03m# Use `backend_name` in this function instead of `backend`\u001b[39;00m\n\u001b[1;32m 415\u001b[0m backend_name \u001b[38;5;241m=\u001b[39m backend\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/dag.py:110\u001b[0m, in \u001b[0;36mancestors\u001b[0;34m(G, source)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@nx\u001b[39m\u001b[38;5;241m.\u001b[39m_dispatch\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mancestors\u001b[39m(G, source):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all nodes having a path to `source` in `G`.\u001b[39;00m\n\u001b[1;32m 79\u001b[0m \n\u001b[1;32m 80\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03m descendants\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m{\u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mparent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mnx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbfs_edges\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/dag.py:110\u001b[0m, in \u001b[0;36m<setcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@nx\u001b[39m\u001b[38;5;241m.\u001b[39m_dispatch\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mancestors\u001b[39m(G, source):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all nodes having a path to `source` in `G`.\u001b[39;00m\n\u001b[1;32m 79\u001b[0m \n\u001b[1;32m 80\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03m descendants\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m{\u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mparent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchild\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mnx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbfs_edges\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/traversal/breadth_first_search.py:203\u001b[0m, in \u001b[0;36mbfs_edges\u001b[0;34m(G, source, reverse, depth_limit, sort_neighbors)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m generic_bfs_edges(\n\u001b[1;32m 200\u001b[0m G, source, \u001b[38;5;28;01mlambda\u001b[39;00m node: \u001b[38;5;28miter\u001b[39m(sort_neighbors(successors(node))), depth_limit\n\u001b[1;32m 201\u001b[0m )\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 203\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m generic_bfs_edges(G, source, successors, depth_limit)\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/algorithms/traversal/breadth_first_search.py:103\u001b[0m, in \u001b[0;36mgeneric_bfs_edges\u001b[0;34m(G, source, neighbors, depth_limit, sort_neighbors)\u001b[0m\n\u001b[1;32m 101\u001b[0m n \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(G)\n\u001b[1;32m 102\u001b[0m depth \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 103\u001b[0m next_parents_children \u001b[38;5;241m=\u001b[39m [(source, \u001b[43mneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m)\u001b[49m)]\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m next_parents_children \u001b[38;5;129;01mand\u001b[39;00m depth \u001b[38;5;241m<\u001b[39m depth_limit:\n\u001b[1;32m 105\u001b[0m this_parents_children \u001b[38;5;241m=\u001b[39m next_parents_children\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/research/lib/python3.11/site-packages/networkx/classes/digraph.py:929\u001b[0m, in \u001b[0;36mDiGraph.predecessors\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 927\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28miter\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pred[n])\n\u001b[1;32m 928\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 929\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m NetworkXError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe node \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not in the digraph.\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n",
"\u001b[0;31mNetworkXError\u001b[0m: The node copiousness_en is not in the digraph."
]
}
],
"source": [
"find_etymological_paths(G, \"copiousness_en\")"
]
},
{
"cell_type": "code",
"execution_count": 249,
"id": "df9c6586-b663-488d-98e8-e5565882a5d6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation_type</th>\n",
" <th>child</th>\n",
" <th>parent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>der</td>\n",
" <td>terrado_pt</td>\n",
" <td>terra_pt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>inh</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>sḱeyd-_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>inh</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>skeyd-_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>inh</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>sḱeyd- ~ skeyt-_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>inh</td>\n",
" <td>skītaną_gem-pro</td>\n",
" <td>skeyd_ine-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724901</th>\n",
" <td>inh</td>\n",
" <td>kokea_fi</td>\n",
" <td>kokedak_fiu-fin-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724902</th>\n",
" <td>inh</td>\n",
" <td>aluna_pt</td>\n",
" <td>alumna_la</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724903</th>\n",
" <td>der(s)</td>\n",
" <td>litográfico_es</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724904</th>\n",
" <td>bor</td>\n",
" <td>cúlóm_ga</td>\n",
" <td>coulomb_en</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724905</th>\n",
" <td>inh</td>\n",
" <td>soiscéal_ga</td>\n",
" <td>soiscél_sga</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>724906 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" relation_type child parent\n",
"0 der terrado_pt terra_pt\n",
"1 inh skītaną_gem-pro sḱeyd-_ine-pro\n",
"2 inh skītaną_gem-pro skeyd-_ine-pro\n",
"3 inh skītaną_gem-pro sḱeyd- ~ skeyt-_ine-pro\n",
"4 inh skītaną_gem-pro skeyd_ine-pro\n",
"... ... ... ...\n",
"724901 inh kokea_fi kokedak_fiu-fin-pro\n",
"724902 inh aluna_pt alumna_la\n",
"724903 der(s) litográfico_es NaN\n",
"724904 bor cúlóm_ga coulomb_en\n",
"724905 inh soiscéal_ga soiscél_sga\n",
"\n",
"[724906 rows x 3 columns]"
]
},
"execution_count": 249,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_link"
]
},
{
"cell_type": "code",
"execution_count": 251,
"id": "eda8df48-9acf-4f7e-b153-d40bf80150c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" lang field lexeme meaning classification_lang normalized_form\n",
"id \n",
"1129441 en 0 exampler exampler en exampler\n",
"1129443 en 0 examplers examplers en examplers\n",
"1578981 la 0 examplexor NaN la examplexor\n",
"16136 en 0 example example en example\n",
"16138 enm 0 example NaN enm example\n",
"1866476 fro 0 example NaN fro example\n",
"315456 en 0 examples examples en examples\n",
"949419 en 0 exampled exampled en exampled\n",
"Empty DataFrame\n",
"Columns: [relation_type, child, parent]\n",
"Index: []\n"
]
}
],
"source": [
"# Entries whose lexeme starts with 'example' (a prefix match, not a substring\n",
"# search); entries with a missing lexeme never match.\n",
"filtered_df = df_values.loc[df_values['lexeme'].str.startswith('example', na=False)]\n",
"\n",
"# Links that reference any of those entries. Only the two id-bearing columns are\n",
"# compared, so a value in another column can never produce a spurious match.\n",
"# NOTE(review): this only finds rows while `child`/`parent` hold numeric ids from\n",
"# the df_values index; if an earlier cell replaced them with 'lexeme_lang' keys\n",
"# the result is empty. Confirm which form df_link is in before relying on it.\n",
"rows_with_ids = df_link.loc[df_link[['child', 'parent']].isin(filtered_df.index).any(axis=1)]\n",
"\n",
"# Rich display renders both frames as tables instead of plain-text dumps.\n",
"display(filtered_df, rows_with_ids)"
]
},
{
"cell_type": "code",
"execution_count": 268,
"id": "d6ebbcc9-45cd-4e5a-a58a-8a10ec1ca3f2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation_type</th>\n",
" <th>child</th>\n",
" <th>parent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>251448</th>\n",
" <td>inh</td>\n",
" <td>cōp_odt</td>\n",
" <td>kaupaz_gem-pro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>483114</th>\n",
" <td>inh</td>\n",
" <td>cōpon_odt</td>\n",
" <td>kaupōną_gem-pro</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation_type child parent\n",
"251448 inh cōp_odt kaupaz_gem-pro\n",
"483114 inh cōpon_odt kaupōną_gem-pro"
]
},
"execution_count": 268,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Links whose `child` value starts with 'cōp'; rows with a missing child are excluded.\n",
"df_link.loc[df_link[\"child\"].str.startswith(\"cōp\", na=False)]"
]
},
{
"cell_type": "code",
"execution_count": 271,
"id": "81e74a0d-bf3e-4129-8340-50b7b2b66602",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>meaning</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>396532</th>\n",
" <td>la</td>\n",
" <td>0</td>\n",
" <td>cōpiōsus</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>copiosus</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71240</th>\n",
" <td>la</td>\n",
" <td>0</td>\n",
" <td>cōpia</td>\n",
" <td>supply</td>\n",
" <td>la</td>\n",
" <td>copia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71247</th>\n",
" <td>la</td>\n",
" <td>0</td>\n",
" <td>cōpia</td>\n",
" <td>plenty, abundance</td>\n",
" <td>la</td>\n",
" <td>copia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>925907</th>\n",
" <td>la</td>\n",
" <td>0</td>\n",
" <td>cōpia</td>\n",
" <td>NaN</td>\n",
" <td>la</td>\n",
" <td>copia</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" lang field lexeme meaning classification_lang \\\n",
"id \n",
"396532 la 0 cōpiōsus NaN la \n",
"71240 la 0 cōpia supply la \n",
"71247 la 0 cōpia plenty, abundance la \n",
"925907 la 0 cōpia NaN la \n",
"\n",
" normalized_form \n",
"id \n",
"396532 copiosus \n",
"71240 copia \n",
"71247 copia \n",
"925907 copia "
]
},
"execution_count": 271,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entries whose lexeme starts with 'cōpi' (spelled with the macron); missing lexemes are excluded.\n",
"df_values.loc[df_values[\"lexeme\"].str.startswith(\"cōpi\", na=False)]"
]
},
{
"cell_type": "code",
"execution_count": 285,
"id": "b2b53b0a-9547-4206-9a99-1ff02ccaaefe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>meaning</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1010388</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copings</td>\n",
" <td>copings</td>\n",
" <td>en</td>\n",
" <td>copings</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1123071</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copist</td>\n",
" <td>copist</td>\n",
" <td>en</td>\n",
" <td>copist</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1123072</th>\n",
" <td>fr</td>\n",
" <td>0</td>\n",
" <td>copiste</td>\n",
" <td>copist</td>\n",
" <td>fr</td>\n",
" <td>copiste</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1123073</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copists</td>\n",
" <td>copists</td>\n",
" <td>en</td>\n",
" <td>copists</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1127504</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copicide</td>\n",
" <td>copicide</td>\n",
" <td>en</td>\n",
" <td>copicide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92270</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copious</td>\n",
" <td>copious</td>\n",
" <td>en</td>\n",
" <td>copious</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92271</th>\n",
" <td>la</td>\n",
" <td>0</td>\n",
" <td>copiosus ~ copia</td>\n",
" <td>abundance</td>\n",
" <td>la</td>\n",
" <td>copiosus ~ copia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>956757</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copiotroph</td>\n",
" <td>copiotroph</td>\n",
" <td>en</td>\n",
" <td>copiotroph</td>\n",
" </tr>\n",
" <tr>\n",
" <th>974941</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copiloting</td>\n",
" <td>copiloting</td>\n",
" <td>en</td>\n",
" <td>copiloting</td>\n",
" </tr>\n",
" <tr>\n",
" <th>974942</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copiloted</td>\n",
" <td>copiloted</td>\n",
" <td>en</td>\n",
" <td>copiloted</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>89 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" lang field lexeme meaning classification_lang \\\n",
"id \n",
"1010388 en 0 copings copings en \n",
"1123071 en 0 copist copist en \n",
"1123072 fr 0 copiste copist fr \n",
"1123073 en 0 copists copists en \n",
"1127504 en 0 copicide copicide en \n",
"... ... ... ... ... ... \n",
"92270 en 0 copious copious en \n",
"92271 la 0 copiosus ~ copia abundance la \n",
"956757 en 0 copiotroph copiotroph en \n",
"974941 en 0 copiloting copiloting en \n",
"974942 en 0 copiloted copiloted en \n",
"\n",
" normalized_form \n",
"id \n",
"1010388 copings \n",
"1123071 copist \n",
"1123072 copiste \n",
"1123073 copists \n",
"1127504 copicide \n",
"... ... \n",
"92270 copious \n",
"92271 copiosus ~ copia \n",
"956757 copiotroph \n",
"974941 copiloting \n",
"974942 copiloted \n",
"\n",
"[89 rows x 6 columns]"
]
},
"execution_count": 285,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entries whose lexeme starts with 'copi' (spelled without the macron); missing lexemes are excluded.\n",
"df_values.loc[df_values[\"lexeme\"].str.startswith(\"copi\", na=False)]"
]
},
{
"cell_type": "code",
"execution_count": 313,
"id": "5440152f-31c6-44d9-b0a0-3ef6f6cd7ab0",
"metadata": {},
"outputs": [],
"source": [
"def _rows_with_prefix(frame, prefix):\n",
"    \"\"\"Return the rows of `frame` in which at least one cell starts with `prefix`.\n",
"\n",
"    Each cell is compared through its string form, so numeric cells are matched\n",
"    by their digits. Missing cells never match, and the index is not searched.\n",
"    \"\"\"\n",
"    # Vectorised per column; replaces the deprecated element-wise DataFrame.applymap.\n",
"    starts_with = frame.astype(str).apply(\n",
"        lambda column: column.str.startswith(prefix, na=False)\n",
"    )\n",
"    # astype(str) can spell a missing cell as 'nan', which would match prefixes\n",
"    # such as 'n' or 'na'; mask those cells out explicitly.\n",
"    return frame[(starts_with & frame.notna()).any(axis=1)]\n",
"\n",
"\n",
"def find_substring_in_values(substring):\n",
"    \"\"\"Return the rows of the global `df_values` with a cell starting with `substring`.\n",
"\n",
"    Despite the name, this is a prefix match on each cell's string form, not a\n",
"    general substring search. Missing cells are ignored.\n",
"    \"\"\"\n",
"    return _rows_with_prefix(df_values, substring)\n",
"\n",
"\n",
"def find_substring_in_links(substring):\n",
"    \"\"\"Return the rows of the global `df_link` with a cell starting with `substring`.\n",
"\n",
"    Despite the name, this is a prefix match on each cell's string form, not a\n",
"    general substring search. Missing cells are ignored.\n",
"    \"\"\"\n",
"    return _rows_with_prefix(df_link, substring)"
]
},
{
"cell_type": "code",
"execution_count": 314,
"id": "c7b63d39-4d97-4efb-8a5a-dca9abd586af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2784440865.py:11: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
" contains_substring = df_link.applymap(lambda x: str(x).startswith(substring)).any(axis=1)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation_type</th>\n",
" <th>child</th>\n",
" <th>parent</th>\n",
" <th>child_lexeme</th>\n",
" <th>parent_lexeme</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>51644</th>\n",
" <td>der</td>\n",
" <td>553565</td>\n",
" <td>399582</td>\n",
" <td>copione_it</td>\n",
" <td>copiare_it</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109777</th>\n",
" <td>inh</td>\n",
" <td>396531</td>\n",
" <td>396532</td>\n",
" <td>copioso_it</td>\n",
" <td>copiosus_la</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235078</th>\n",
" <td>inh</td>\n",
" <td>396533</td>\n",
" <td>396532</td>\n",
" <td>copioso_pt</td>\n",
" <td>copiosus_la</td>\n",
" </tr>\n",
" <tr>\n",
" <th>273367</th>\n",
" <td>inh</td>\n",
" <td>396534</td>\n",
" <td>396532</td>\n",
" <td>copioso_es</td>\n",
" <td>copiosus_la</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500184</th>\n",
" <td>cmpd+bor</td>\n",
" <td>92271</td>\n",
" <td>-1276</td>\n",
" <td>copiosus ~ copia_la</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>624282</th>\n",
" <td>bor</td>\n",
" <td>847746</td>\n",
" <td>396532</td>\n",
" <td>copieux_fr</td>\n",
" <td>copiosus_la</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation_type child parent child_lexeme parent_lexeme\n",
"51644 der 553565 399582 copione_it copiare_it\n",
"109777 inh 396531 396532 copioso_it copiosus_la\n",
"235078 inh 396533 396532 copioso_pt copiosus_la\n",
"273367 inh 396534 396532 copioso_es copiosus_la\n",
"500184 cmpd+bor 92271 -1276 copiosus ~ copia_la NaN\n",
"624282 bor 847746 396532 copieux_fr copiosus_la"
]
},
"execution_count": 314,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Links in which any cell's text starts with 'copio'.\n",
"find_substring_in_links(\"copio\")"
]
},
{
"cell_type": "code",
"execution_count": 315,
"id": "571dc06d-3fe4-4866-bbeb-2fb7d43fd113",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/_1/v79t6xvn6ds15sxk2lxl00cr0000gn/T/ipykernel_38644/2784440865.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
" contains_substring = df_values.applymap(lambda x: str(x).startswith(substring)).any(axis=1)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>meaning</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1089933</th>\n",
" <td>sk</td>\n",
" <td>0</td>\n",
" <td>sporý</td>\n",
" <td>copious</td>\n",
" <td>sk</td>\n",
" <td>sporý</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1489361</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copiousnesses</td>\n",
" <td>copiousnesses</td>\n",
" <td>en</td>\n",
" <td>copiousnesses</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320737</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copiously</td>\n",
" <td>copiously</td>\n",
" <td>en</td>\n",
" <td>copiously</td>\n",
" </tr>\n",
" <tr>\n",
" <th>456061</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copiousness</td>\n",
" <td>copiousness</td>\n",
" <td>en</td>\n",
" <td>copiousness</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92270</th>\n",
" <td>en</td>\n",
" <td>0</td>\n",
" <td>copious</td>\n",
" <td>copious</td>\n",
" <td>en</td>\n",
" <td>copious</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" lang field lexeme meaning classification_lang \\\n",
"id \n",
"1089933 sk 0 sporý copious sk \n",
"1489361 en 0 copiousnesses copiousnesses en \n",
"320737 en 0 copiously copiously en \n",
"456061 en 0 copiousness copiousness en \n",
"92270 en 0 copious copious en \n",
"\n",
" normalized_form \n",
"id \n",
"1089933 sporý \n",
"1489361 copiousnesses \n",
"320737 copiously \n",
"456061 copiousness \n",
"92270 copious "
]
},
"execution_count": 315,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entries in which any cell's text starts with 'copious'.\n",
"find_substring_in_values(\"copious\")"
]
},
{
"cell_type": "code",
"execution_count": 311,
"id": "82fb127c-7787-4e6f-8817-890a3d9211d2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation_type</th>\n",
" <th>child</th>\n",
" <th>parent</th>\n",
" <th>child_lexeme</th>\n",
" <th>parent_lexeme</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>der(s)</td>\n",
" <td>1165275</td>\n",
" <td>-40470</td>\n",
" <td>offensivamente_it</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>cmpd+bor</td>\n",
" <td>275331</td>\n",
" <td>-10538</td>\n",
" <td>afuera_es</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>cmpd+bor</td>\n",
" <td>1606111</td>\n",
" <td>-62916</td>\n",
" <td>transisjonsmetall_nn</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>cmpd+bor</td>\n",
" <td>76497</td>\n",
" <td>-1026</td>\n",
" <td>پنجابی_fa</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>der(s)</td>\n",
" <td>1644793</td>\n",
" <td>-65270</td>\n",
" <td>memoriousness_en</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724807</th>\n",
" <td>cmpd+bor</td>\n",
" <td>1252646</td>\n",
" <td>-47339</td>\n",
" <td>mycha_pl</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724868</th>\n",
" <td>cmpd+bor</td>\n",
" <td>410216</td>\n",
" <td>-8062</td>\n",
" <td>refinar_pt</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724877</th>\n",
" <td>der(s)</td>\n",
" <td>1053115</td>\n",
" <td>-33809</td>\n",
" <td>wordwise_en</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724894</th>\n",
" <td>cmpd+bor</td>\n",
" <td>61790</td>\n",
" <td>-766</td>\n",
" <td>laurer_fr</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724903</th>\n",
" <td>der(s)</td>\n",
" <td>1120131</td>\n",
" <td>-37200</td>\n",
" <td>litográfico_es</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>81102 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" relation_type child parent child_lexeme parent_lexeme\n",
"15 der(s) 1165275 -40470 offensivamente_it NaN\n",
"22 cmpd+bor 275331 -10538 afuera_es NaN\n",
"30 cmpd+bor 1606111 -62916 transisjonsmetall_nn NaN\n",
"31 cmpd+bor 76497 -1026 پنجابی_fa NaN\n",
"48 der(s) 1644793 -65270 memoriousness_en NaN\n",
"... ... ... ... ... ...\n",
"724807 cmpd+bor 1252646 -47339 mycha_pl NaN\n",
"724868 cmpd+bor 410216 -8062 refinar_pt NaN\n",
"724877 der(s) 1053115 -33809 wordwise_en NaN\n",
"724894 cmpd+bor 61790 -766 laurer_fr NaN\n",
"724903 der(s) 1120131 -37200 litográfico_es NaN\n",
"\n",
"[81102 rows x 5 columns]"
]
},
"execution_count": 311,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Links with a missing value in at least one column.\n",
"df_link.loc[df_link.isna().any(axis=\"columns\")]"
]
},
{
"cell_type": "code",
"execution_count": 320,
"id": "09c17a8c-997e-4fa7-9bcb-7ebabb88a89e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>meaning</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1015334</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>描</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>描</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102467</th>\n",
" <td>de</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>zero</td>\n",
" <td>de</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1026310</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>曼谷</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>曼谷</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1064258</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>白菜__HASH__Chinese</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>白菜__HASH__Chinese</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1110324</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>khà-sàng</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>khà-sàng</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>947497</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>馬車</td>\n",
" <td>becak</td>\n",
" <td>NaN</td>\n",
" <td>馬車</td>\n",
" </tr>\n",
" <tr>\n",
" <th>979068</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>白話字 ~ 白话字</td>\n",
" <td>vernacular writing</td>\n",
" <td>NaN</td>\n",
" <td>白話字 ~ 白话字</td>\n",
" </tr>\n",
" <tr>\n",
" <th>991521</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>復仇</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>復仇</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1397622</th>\n",
" <td>aql-pro</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>aql-pro</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>907358</th>\n",
" <td>trk-pro</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>trk-pro</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>221 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" lang field lexeme meaning \\\n",
"id \n",
"1015334 NaN 0 描 NaN \n",
"102467 de 0 NaN zero \n",
"1026310 NaN 0 曼谷 NaN \n",
"1064258 NaN 0 白菜__HASH__Chinese NaN \n",
"1110324 NaN 0 khà-sàng NaN \n",
"... ... ... ... ... \n",
"947497 NaN 0 馬車 becak \n",
"979068 NaN 0 白話字 ~ 白话字 vernacular writing \n",
"991521 NaN 0 復仇 NaN \n",
"1397622 aql-pro 0 NaN NaN \n",
"907358 trk-pro 0 NaN NaN \n",
"\n",
" classification_lang normalized_form \n",
"id \n",
"1015334 NaN 描 \n",
"102467 de NaN \n",
"1026310 NaN 曼谷 \n",
"1064258 NaN 白菜__HASH__Chinese \n",
"1110324 NaN khà-sàng \n",
"... ... ... \n",
"947497 NaN 馬車 \n",
"979068 NaN 白話字 ~ 白话字 \n",
"991521 NaN 復仇 \n",
"1397622 aql-pro NaN \n",
"907358 trk-pro NaN \n",
"\n",
"[221 rows x 6 columns]"
]
},
"execution_count": 320,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entries with a missing value in at least one column.\n",
"df_values.loc[df_values.isna().any(axis=\"columns\")]"
]
},
{
"cell_type": "code",
"execution_count": 325,
"id": "f2e02f80-4f8b-45a6-aab5-6575e7c2d61d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lang</th>\n",
" <th>field</th>\n",
" <th>lexeme</th>\n",
" <th>classification_lang</th>\n",
" <th>normalized_form</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1015334</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>描</td>\n",
" <td>NaN</td>\n",
" <td>描</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102467</th>\n",
" <td>de</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>de</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1026310</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>曼谷</td>\n",
" <td>NaN</td>\n",
" <td>曼谷</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1064258</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>白菜__HASH__Chinese</td>\n",
" <td>NaN</td>\n",
" <td>白菜__HASH__Chinese</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1110324</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>khà-sàng</td>\n",
" <td>NaN</td>\n",
" <td>khà-sàng</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>947497</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>馬車</td>\n",
" <td>NaN</td>\n",
" <td>馬車</td>\n",
" </tr>\n",
" <tr>\n",
" <th>979068</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>白話字 ~ 白话字</td>\n",
" <td>NaN</td>\n",
" <td>白話字 ~ 白话字</td>\n",
" </tr>\n",
" <tr>\n",
" <th>991521</th>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>復仇</td>\n",
" <td>NaN</td>\n",
" <td>復仇</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1397622</th>\n",
" <td>aql-pro</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>aql-pro</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>907358</th>\n",
" <td>trk-pro</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>trk-pro</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>221 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" lang field lexeme classification_lang \\\n",
"id \n",
"1015334 NaN 0 描 NaN \n",
"102467 de 0 NaN de \n",
"1026310 NaN 0 曼谷 NaN \n",
"1064258 NaN 0 白菜__HASH__Chinese NaN \n",
"1110324 NaN 0 khà-sàng NaN \n",
"... ... ... ... ... \n",
"947497 NaN 0 馬車 NaN \n",
"979068 NaN 0 白話字 ~ 白话字 NaN \n",
"991521 NaN 0 復仇 NaN \n",
"1397622 aql-pro 0 NaN aql-pro \n",
"907358 trk-pro 0 NaN trk-pro \n",
"\n",
" normalized_form \n",
"id \n",
"1015334 描 \n",
"102467 NaN \n",
"1026310 曼谷 \n",
"1064258 白菜__HASH__Chinese \n",
"1110324 khà-sàng \n",
"... ... \n",
"947497 馬車 \n",
"979068 白話字 ~ 白话字 \n",
"991521 復仇 \n",
"1397622 NaN \n",
"907358 NaN \n",
"\n",
"[221 rows x 5 columns]"
]
},
"execution_count": 325,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entries with a missing value in any column other than `meaning`.\n",
"# `meaning` is dropped once and the null mask is built from that same frame,\n",
"# instead of computing the drop twice in one expression.\n",
"(\n",
"    df_values\n",
"    .drop(columns=\"meaning\")\n",
"    .loc[lambda frame: frame.isna().any(axis=\"columns\")]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ab9c1dd-829b-4697-92b4-24a26b9479c3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}