{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "a1718c4d-231f-4b55-b84b-737e6ed0efeb",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import spacy\n",
"from collections import Counter\n",
"from IPython.display import display, Markdown\n",
"# Lift pandas' row limit so displayed DataFrames are never truncated.\n",
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "markdown",
"id": "3f392a04-3969-429d-a730-295694e461a0",
"metadata": {},
"source": [
"### Sentences"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1c44a7c8-1052-42de-a24a-cbec269baddd",
"metadata": {},
"outputs": [],
"source": [
"# Sample passages for the cognate counter; each variable name states the intended Germanic/Romance mix.\n",
"# Passage meant to mix Germanic- and Romance-origin cognates.\n",
"balanced_germanic_romance_sentence = \"\"\"\n",
"Amidst the blooming garden, the adept gardener, a truly expert and proficient individual, skillfully tended to the endless rows of celestial \n",
"flowers. His companion, equally skilled and equally committed to their shared companionship, remained approachable and joyful, their pulses \n",
"beating in harmony with nature's rhythm. Together, they cherished their allegiance to the earth, their unwavering loyalty mirroring the \n",
"firm, steady growth of each blossom. As they worked, their satisfaction bloomed, fulfilling a shared love for the beauty they nurtured daily,\n",
"their efforts a testament to their credible and trustworthy dedication to the garden's eternal flourish.\n",
"\"\"\"\n",
"\n",
"# Passage meant to contain only Germanic-origin cognates.\n",
"fully_germanic_text = \"\"\"\n",
"The skillful and skilled artisan showed unwavering allegiance to his craft, creating endless blooms that were both believable and heavenly. \n",
"His steady hand and stiff resolve shut out any distractions, making his work truly everlasting.\n",
"\"\"\"\n",
"\n",
"# Passage meant to contain only Romance-origin cognates.\n",
"fully_romance_text = \"\"\"The proficient expert resumed his enchanting narration, ensuring his companions were grateful and satisfied \n",
"with his incredible story.\"\"\""
]
},
{
"cell_type": "markdown",
"id": "e5d00580-dd24-42f8-874e-f9f078ebd382",
"metadata": {},
"source": [
"### Code"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f119ef07-b6fe-473c-9cfa-f7ff63d10b64",
"metadata": {},
"outputs": [],
"source": [
"class TextCognateCounter:\n",
"    \"\"\"Count how often the entries of a cognate list occur in a text.\n",
"\n",
"    The list is loaded from a CSV that provides the columns ``synset``, ``word``\n",
"    and ``Source`` (plus ``POS`` in POS-aware mode). Text is lemmatised and\n",
"    POS-tagged with spaCy's ``en_core_web_sm`` pipeline.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    cognates_list_file\n",
"        Path (or buffer) handed to ``pandas.read_csv``.\n",
"    sent_beg, sent_end : str\n",
"        Sentence markers that are stripped from tokens before counting.\n",
"    _POS : bool\n",
"        True (default): a token matches only if lemma and POS tag both agree.\n",
"        False: only the lemma is compared.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, cognates_list_file, sent_beg=\"<s>\", sent_end=\"</s>\", _POS=True):\n",
"        self.cognates_df = pd.read_csv(cognates_list_file)\n",
"        self.POS = _POS\n",
"        # Lookup keys: \"word_POS\" in POS-aware mode, the bare word otherwise.\n",
"        self.word_list = list(self._lookup_keys(self.cognates_df))\n",
"        self.begin_sentence_marker = sent_beg\n",
"        self.end_sentence_marker = sent_end\n",
"        self.nlp = spacy.load('en_core_web_sm')\n",
"\n",
"    def _lookup_keys(self, frame):\n",
"        \"\"\"Return the per-row matching key of ``frame`` for the current POS mode.\"\"\"\n",
"        if self.POS:\n",
"            return frame['word'] + \"_\" + frame['POS']\n",
"        return frame['word']\n",
"\n",
"    def count_cognates_in_text(self, raw_text):\n",
"        \"\"\"Tag ``raw_text`` and return the cognate table with a filled ``count`` column.\n",
"\n",
"        Every row of the cognate list is returned; rows whose cognate does not\n",
"        occur in the text keep a count of 0.\n",
"        \"\"\"\n",
"        tagged_tokens = POS_tag_text(nlp=self.nlp, raw_text=raw_text)\n",
"        return self._count_tagged_tokens(tagged_tokens)\n",
"\n",
"    def _count_tagged_tokens(self, tagged_tokens):\n",
"        \"\"\"Tally already tagged ``\"lemma_POS\"`` tokens against the cognate table.\"\"\"\n",
"        words = []\n",
"        for token in tagged_tokens:\n",
"            token = token.replace(self.begin_sentence_marker, '').replace(self.end_sentence_marker, '')\n",
"            if not self.POS:\n",
"                # The tagger always appends \"_<POS>\"; drop it, otherwise a bare\n",
"                # cognate word could never equal a token and every count stayed 0.\n",
"                token = token.rsplit(\"_\", 1)[0]\n",
"            words.extend(token.split(\" \"))\n",
"        token_counts = Counter(words)\n",
"\n",
"        cognate_counts = self.init_cognate_count_dict()\n",
"        # One dictionary lookup per row on the full key: no per-cognate boolean\n",
"        # masks, and words that themselves contain \"_\" no longer break a split.\n",
"        keys = self._lookup_keys(cognate_counts)\n",
"        cognate_counts['count'] = keys.map(lambda key: token_counts.get(key, 0)).astype('int64')\n",
"        return cognate_counts\n",
"\n",
"    def init_cognate_count_dict(self):\n",
"        \"\"\"Return a copy of the cognate table with a zeroed ``count`` column.\n",
"\n",
"        Despite the name the result is a DataFrame; ``self.cognates_df`` is not modified.\n",
"        \"\"\"\n",
"        columns = ['synset', 'word', 'Source']\n",
"        if self.POS:\n",
"            columns.append('POS')\n",
"        cognate_counts = self.cognates_df[columns].copy()\n",
"        cognate_counts['count'] = 0\n",
"        return cognate_counts"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8fed55f3-6248-4f76-95c3-26aa142ca666",
"metadata": {},
"outputs": [],
"source": [
"def POS_tag_text(nlp, raw_text=\"\"):\n",
"    \"\"\"Lemmatise and POS-tag ``raw_text`` with the given pipeline.\n",
"\n",
"    ``nlp`` is called on the text; the result's sentences are walked in order\n",
"    and every token contributes one ``\"<lemma>_<POS>\"`` string to a single\n",
"    flat list, which is returned.\n",
"    \"\"\"\n",
"    parsed = nlp(raw_text)\n",
"    return [\n",
"        token.lemma_ + \"_\" + token.pos_\n",
"        for sentence in parsed.sents\n",
"        for token in sentence\n",
"    ]\n",
"\n",
"def calculate_germanic_tendency(text: str, counter: TextCognateCounter, _POS=True, verbose=True):\n",
"    \"\"\"Score how Germanic vs. Romance the cognates found in ``text`` are.\n",
"\n",
"    The Germanic tendency is the summed ``count`` of rows with ``Source == 'G'``\n",
"    divided by the summed count of the 'G' and 'R' rows; the Romance tendency is\n",
"    the 'R' share of the same total. Both are 0.0 when no cognate is found.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Passage to analyse.\n",
"    counter : TextCognateCounter\n",
"        Its ``count_cognates_in_text(text)`` must return a DataFrame with\n",
"        ``Source`` and ``count`` columns.\n",
"    _POS : bool\n",
"        Unused; kept so existing calls keep working (the POS mode is fixed on ``counter``).\n",
"    verbose : bool\n",
"        When True, render the passage, the matched rows and the unrounded scores.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        ``germanic_tendency`` and ``romance_tendency`` as built-in floats rounded\n",
"        to three decimals.\n",
"\n",
"    NOTE(review): a (word, POS) pair listed under several synsets contributes its\n",
"    count once per row, so such tokens weigh more in both sums - confirm this is intended.\n",
"    \"\"\"\n",
"    cognate_counts = counter.count_cognates_in_text(text)\n",
"    source = cognate_counts['Source']\n",
"    # int() converts the NumPy sums to built-in ints, so the ratios below are\n",
"    # built-in floats rather than NumPy scalars.\n",
"    total_germanic_count = int(cognate_counts.loc[source == 'G', 'count'].sum())\n",
"    total_romance_count = int(cognate_counts.loc[source == 'R', 'count'].sum())\n",
"    total_count = total_germanic_count + total_romance_count\n",
"\n",
"    if total_count > 0:\n",
"        GT = total_germanic_count / total_count\n",
"        RT = total_romance_count / total_count\n",
"    else:\n",
"        # Floats here as well, so the return type does not depend on the input.\n",
"        GT = RT = 0.0\n",
"\n",
"    if verbose:\n",
"        display(Markdown(f\"##### Input Sentence:\\n {text}\\n\\n\"))\n",
"        display(Markdown(\"___\"))\n",
"        display(cognate_counts[cognate_counts[\"count\"] != 0])\n",
"        display(Markdown(f\"**Germanic Tendency: {GT}, Romance Tendency: {RT}**\"))\n",
"\n",
"    return {\"germanic_tendency\": round(GT, 3), \"romance_tendency\": round(RT, 3)}\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "23f1de27-8c88-4844-8902-4d450561fcf5",
"metadata": {},
"outputs": [],
"source": [
"# CSV of cognates; TextCognateCounter reads its synset, word, Source and POS columns.\n",
"cognates_list_file = \"manual_synset_list_with_origin_and_POS.csv\"\n",
"# Built once and reused for every passage: the constructor also loads the spaCy model.\n",
"cognate_counter = TextCognateCounter(cognates_list_file)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "95fa5704-2e69-4484-9fc1-86ca7cd28335",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"##### Input Sentence:\n",
" \n",
"Amidst the blooming garden, the adept gardener, a truly expert and proficient individual, skillfully tended to the endless rows of celestial \n",
"flowers. His companion, equally skilled and equally committed to their shared companionship, remained approachable and joyful, their pulses \n",
"beating in harmony with nature's rhythm. Together, they cherished their allegiance to the earth, their unwavering loyalty mirroring the \n",
"firm, steady growth of each blossom. As they worked, their satisfaction bloomed, fulfilling a shared love for the beauty they nurtured daily,\n",
"their efforts a testament to their credible and trustworthy dedication to the garden's eternal flourish.\n",
"\n",
"\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"___"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>synset</th>\n",
" <th>word</th>\n",
" <th>Source</th>\n",
" <th>POS</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>adept</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>proficient</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>skilled</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>allegiance</td>\n",
" <td>G</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2</td>\n",
" <td>loyalty</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4</td>\n",
" <td>approachable</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>5</td>\n",
" <td>bloom</td>\n",
" <td>G</td>\n",
" <td>VERB</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>6</td>\n",
" <td>celestial</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>8</td>\n",
" <td>companion</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>9</td>\n",
" <td>companionship</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>11</td>\n",
" <td>credible</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>13</td>\n",
" <td>endless</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>13</td>\n",
" <td>eternal</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>14</td>\n",
" <td>steady</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>14</td>\n",
" <td>unwavering</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>15</td>\n",
" <td>joyful</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>21</td>\n",
" <td>pulse</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170</th>\n",
" <td>74</td>\n",
" <td>trustworthy</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>155</td>\n",
" <td>steady</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" synset word Source POS count\n",
"0 1 adept R ADJ 1\n",
"2 1 proficient R ADJ 1\n",
"4 1 skilled G ADJ 1\n",
"5 2 allegiance G NOUN 1\n",
"6 2 loyalty R NOUN 1\n",
"10 4 approachable R ADJ 1\n",
"12 5 bloom G VERB 2\n",
"15 6 celestial R ADJ 1\n",
"19 8 companion R NOUN 1\n",
"21 9 companionship R NOUN 1\n",
"26 11 credible R ADJ 1\n",
"31 13 endless G ADJ 1\n",
"32 13 eternal R ADJ 1\n",
"36 14 steady G ADJ 1\n",
"38 14 unwavering G ADJ 1\n",
"41 15 joyful R ADJ 1\n",
"54 21 pulse R NOUN 1\n",
"170 74 trustworthy G ADJ 1\n",
"348 155 steady G ADJ 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**Germanic Tendency: 0.45, Romance Tendency: 0.55**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"##### Input Sentence:\n",
" The proficient expert resumed his enchanting narration, ensuring his companions were grateful and satisfied \n",
"with his incredible story.\n",
"\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"___"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>synset</th>\n",
" <th>word</th>\n",
" <th>Source</th>\n",
" <th>POS</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>proficient</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>8</td>\n",
" <td>companion</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>16</td>\n",
" <td>grateful</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>19</td>\n",
" <td>narration</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>19</td>\n",
" <td>story</td>\n",
" <td>R</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>22</td>\n",
" <td>resume</td>\n",
" <td>R</td>\n",
" <td>VERB</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>77</td>\n",
" <td>incredible</td>\n",
" <td>R</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" synset word Source POS count\n",
"2 1 proficient R ADJ 1\n",
"19 8 companion R NOUN 1\n",
"42 16 grateful R ADJ 1\n",
"49 19 narration R NOUN 1\n",
"51 19 story R NOUN 1\n",
"58 22 resume R VERB 1\n",
"177 77 incredible R ADJ 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**Germanic Tendency: 0.0, Romance Tendency: 1.0**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"##### Input Sentence:\n",
" \n",
"The skillful and skilled artisan showed unwavering allegiance to his craft, creating endless blooms that were both believable and heavenly. \n",
"His steady hand and stiff resolve shut out any distractions, making his work truly everlasting.\n",
"\n",
"\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"___"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>synset</th>\n",
" <th>word</th>\n",
" <th>Source</th>\n",
" <th>POS</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>skillful</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>skilled</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>allegiance</td>\n",
" <td>G</td>\n",
" <td>NOUN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>7</td>\n",
" <td>shut</td>\n",
" <td>G</td>\n",
" <td>VERB</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>11</td>\n",
" <td>believable</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>13</td>\n",
" <td>endless</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>14</td>\n",
" <td>steady</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>14</td>\n",
" <td>stiff</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>72</td>\n",
" <td>show</td>\n",
" <td>G</td>\n",
" <td>VERB</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>155</td>\n",
" <td>steady</td>\n",
" <td>G</td>\n",
" <td>ADJ</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" synset word Source POS count\n",
"3 1 skillful G ADJ 1\n",
"4 1 skilled G ADJ 1\n",
"5 2 allegiance G NOUN 1\n",
"18 7 shut G VERB 1\n",
"27 11 believable G ADJ 1\n",
"31 13 endless G ADJ 1\n",
"36 14 steady G ADJ 1\n",
"37 14 stiff G ADJ 1\n",
"166 72 show G VERB 1\n",
"348 155 steady G ADJ 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**Germanic Tendency: 1.0, Romance Tendency: 0.0**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'germanic_tendency': 1.0, 'romance_tendency': 0.0}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Score each sample passage; verbose defaults to True, so every call renders its own report.\n",
"calculate_germanic_tendency(balanced_germanic_romance_sentence, cognate_counter)\n",
"\n",
"calculate_germanic_tendency(fully_romance_text, cognate_counter)\n",
"\n",
"# Only this last call's return value is echoed as the cell result.\n",
"calculate_germanic_tendency(fully_germanic_text, cognate_counter)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d12db334-407f-4a0d-a634-23a2294d800f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}