# %%
import pandas as pd
import re

# %%
# Load the POS-tagged EN/IT workbook. Per the displayed frame it has the
# columns direction, id, text_type and text, where text holds `word/TAG`
# tokens separated by whitespace.
corpus_path = '/content/data_en_it_tagged.xlsx'
df = pd.read_excel(corpus_path)

# %%
df
\n", "\n", "[528 rows x 4 columns]" ], "text/html": [ "\n", " <div id=\"df-0650964d-e86d-423e-a3ed-9f013d66d5ee\">\n", " <div class=\"colab-df-container\">\n", " <div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>direction</th>\n", " <th>id</th>\n", " <th>text_type</th>\n", " <th>text</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en_to_it</td>\n", " <td>0001en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>Thank/VV you/PP President/NP ./SENT Well/RB...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en_to_it</td>\n", " <td>0002en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>Thank/VV you/PP very/RB much/JJ Mr/NP Pre...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>en_to_it</td>\n", " <td>0003en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>Excuse/VV me/PP ./SENT Thank/VV you/PP Pre...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>en_to_it</td>\n", " <td>0004en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>President/NP ,/, the/DT upheaval/NN in/IN ...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>en_to_it</td>\n", " <td>0005en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>Thank/VV you/PP Mr/NP President/NP ./SENT ...</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>523</th>\n", " <td>it_to_en</td>\n", " <td>1064en_wr_tt</td>\n", " <td>en_wr_tt</td>\n", " <td>Mr/NP President/NP ,/, ladies/NNS and/CC g...</td>\n", " </tr>\n", " <tr>\n", " <th>524</th>\n", " <td>it_to_en</td>\n", " <td>1065en_wr_tt</td>\n", " <td>en_wr_tt</td>\n", " <td>Mr/NP President/NP ,/, High/NP 
Representati...</td>\n", " </tr>\n", " <tr>\n", " <th>525</th>\n", " <td>it_to_en</td>\n", " <td>1066en_wr_tt</td>\n", " <td>en_wr_tt</td>\n", " <td>Mr/NP President/NP ,/, ladies/NNS and/CC g...</td>\n", " </tr>\n", " <tr>\n", " <th>526</th>\n", " <td>it_to_en</td>\n", " <td>1067en_wr_tt</td>\n", " <td>en_wr_tt</td>\n", " <td>Mr/NP President/NP ,/, ladies/NNS and/CC g...</td>\n", " </tr>\n", " <tr>\n", " <th>527</th>\n", " <td>it_to_en</td>\n", " <td>1068en_wr_tt</td>\n", " <td>en_wr_tt</td>\n", " <td>Mr/NP President/NP ,/, ladies/NNS and/CC g...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>528 rows × 4 columns</p>\n", "</div>\n", " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0650964d-e86d-423e-a3ed-9f013d66d5ee')\"\n", " title=\"Convert this dataframe to an interactive table.\"\n", " style=\"display:none;\">\n", " \n", " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", " width=\"24px\">\n", " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", " </svg>\n", " </button>\n", " \n", " <style>\n", " .colab-df-container {\n", " display:flex;\n", " flex-wrap:wrap;\n", " gap: 12px;\n", " }\n", "\n", " .colab-df-convert {\n", " background-color: #E8F0FE;\n", " border: none;\n", " border-radius: 50%;\n", " cursor: pointer;\n", " display: none;\n", " fill: #1967D2;\n", " height: 32px;\n", " padding: 0 0 0 0;\n", " width: 32px;\n", " }\n", "\n", " .colab-df-convert:hover {\n", " background-color: #E2EBFA;\n", " box-shadow: 0px 
1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", " fill: #174EA6;\n", " }\n", "\n", " [theme=dark] .colab-df-convert {\n", " background-color: #3B4455;\n", " fill: #D2E3FC;\n", " }\n", "\n", " [theme=dark] .colab-df-convert:hover {\n", " background-color: #434B5C;\n", " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", " fill: #FFFFFF;\n", " }\n", " </style>\n", "\n", " <script>\n", " const buttonEl =\n", " document.querySelector('#df-0650964d-e86d-423e-a3ed-9f013d66d5ee button.colab-df-convert');\n", " buttonEl.style.display =\n", " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", "\n", " async function convertToInteractive(key) {\n", " const element = document.querySelector('#df-0650964d-e86d-423e-a3ed-9f013d66d5ee');\n", " const dataTable =\n", " await google.colab.kernel.invokeFunction('convertToInteractive',\n", " [key], {});\n", " if (!dataTable) return;\n", "\n", " const docLinkHtml = 'Like what you see? 
# %%
# Tags whose tokens are dropped altogether (punctuation, pauses, disfluencies,
# unclear passages, ...). The original list also carried '/NOCAT', presumably
# aimed at tokens such as `//NOCAT`; splitting on the LAST slash (see
# remove_tags) handles those, so the tag seen is plain 'NOCAT'.
# NOTE(review): "'" is kept as in the original list; if the tagger emits "''"
# for closing quotes (it emits '``' for opening ones) that tag may need adding.
IGNORE_TAGS = frozenset([
    'SENT', ',', "'", 'DYSF', ':', 'EPAUSE', '``', 'FPAUSE',
    'UNCLEAR', 'NOCAT', 'SYM', 'PUN',
])

# Number of tokens per chunk used for the standardised type-token ratio.
CHUNK_SIZE = 1000


def remove_tags(text):
    """Strip POS tags from a whitespace-separated ``word/TAG`` string.

    Each token is split on its *last* slash, so words that themselves contain
    a slash (e.g. ``and/or/CC``) keep their full surface form instead of being
    truncated at the first slash.

    Args:
        text: Tagged text. Non-string values (e.g. NaN from an empty
            spreadsheet cell) are treated as empty text.

    Returns:
        The words joined by single spaces, without tags. Tokens whose tag is
        in ``IGNORE_TAGS`` are dropped; tokens without any slash are kept
        unchanged.
    """
    if not isinstance(text, str):
        return ''

    cleaned_words = []
    for token in text.split():
        word, slash, tag = token.rpartition('/')
        if not slash:
            # Untagged token: keep it as-is.
            cleaned_words.append(token)
        elif word and tag not in IGNORE_TAGS:
            # `word` is empty for a bare "/TAG" token; skipping it avoids
            # injecting empty strings (double spaces) into the output.
            cleaned_words.append(word)
    return ' '.join(cleaned_words)


def split_text(text, chunk_size=CHUNK_SIZE):
    """Split ``text`` on whitespace into consecutive lists of tokens.

    Args:
        text: Plain (untagged) text.
        chunk_size: Maximum number of tokens per chunk.

    Returns:
        A list of token lists; every list has ``chunk_size`` tokens except
        possibly the last one. Empty text yields an empty list.
    """
    words = text.split()
    return [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]


def handle_rows(df, chunk_size=CHUNK_SIZE):
    """Re-segment the corpus into rows of exactly ``chunk_size`` tokens.

    Consecutive rows with the same ``text_type`` are treated as one token
    stream: short texts are merged and long texts are split, and one output
    row is emitted for every full ``chunk_size`` tokens. Tokens left over when
    the text type changes (or at the end of the frame) do not fill a chunk and
    are discarded, so every output row has the same length.

    Args:
        df: Frame with the columns ``direction``, ``id``, ``text_type`` and
            ``text`` (untagged, whitespace-separated).
        chunk_size: Number of tokens per output row; must be >= 1.

    Returns:
        A new DataFrame with the columns ``direction``, ``id``, ``text_type``
        and ``text``. ``id`` lists, comma-separated, the source rows that
        contributed tokens to the chunk. The columns are present even when no
        chunk could be built.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1')

    columns = ['direction', 'id', 'text_type', 'text']
    rows = []
    buffer_text = []   # tokens collected but not yet emitted
    buffer_id = []     # ids of the source rows those tokens came from
    current_type = None

    for _, row in df.iterrows():
        # Compare the full text_type value: the id suffix alone ('st'/'tt')
        # cannot tell e.g. en_sp_st from en_wr_st.
        if row['text_type'] != current_type:
            buffer_text = []
            buffer_id = []
            current_type = row['text_type']

        tokens = row['text'].split()
        if not tokens:
            continue

        buffer_text.extend(tokens)
        buffer_id.append(row['id'])

        # Emit as many full chunks as the buffer holds; the remainder stays
        # in the buffer so no token of a long text is lost.
        while len(buffer_text) >= chunk_size:
            rows.append({
                'direction': row['direction'],
                'id': ', '.join(buffer_id),
                'text_type': row['text_type'],
                'text': ' '.join(buffer_text[:chunk_size]),
            })
            buffer_text = buffer_text[chunk_size:]
            # Whatever is left can only come from the current row.
            buffer_id = [row['id']] if buffer_text else []

    return pd.DataFrame(rows, columns=columns)


# Strip the POS tags from the "text" column
df['text'] = df['text'].apply(remove_tags)

# Merge/split the rows into fixed-size chunks
df = handle_rows(df)

# Print the number of tokens for each unique value in the text_type column
token_counts = df.groupby('text_type')['text'].apply(lambda x: sum(len(text.split()) for text in x))
print(token_counts)

# %%
def calculate_sttr(text):
    """Return the type-token ratio of ``text``.

    The text is lower-cased and split on whitespace; the ratio is the number
    of distinct tokens divided by the number of tokens. Applied to the
    equal-sized chunks built by ``handle_rows``, the per-type mean of these
    values is the standardised TTR.

    Args:
        text: Plain, whitespace-separated text.

    Returns:
        The ratio as a float, or 0.0 for text without tokens.
    """
    tokens = text.lower().split()
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


# apply the function to the 'text' column and create a new 'sttr' column
df['sttr'] = df['text'].apply(calculate_sttr)

# %%
df = df[['direction', 'id', 'text_type', 'sttr', 'text']]

# %%
df
\n", "140 0.483903 We are now about to adopt the agreement on Ira... \n", "141 0.458753 book Premesse della politica Premises of pol... \n", "\n", "[142 rows x 5 columns]" ], "text/html": [ "\n", " <div id=\"df-4fe5e669-2f81-4ac6-8b20-635af14803e5\">\n", " <div class=\"colab-df-container\">\n", " <div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>direction</th>\n", " <th>id</th>\n", " <th>text_type</th>\n", " <th>sttr</th>\n", " <th>text</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en_to_it</td>\n", " <td>0001en_sp_st, 0002en_sp_st, 0003en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>0.461924</td>\n", " <td>Thank you President Well some colleagues took ...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en_to_it</td>\n", " <td>0003en_sp_st, 0004en_sp_st, 0005en_sp_st, 0006...</td>\n", " <td>en_sp_st</td>\n", " <td>0.478478</td>\n", " <td>refrain from using violence and that there wil...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>en_to_it</td>\n", " <td>0006en_sp_st, 0007en_sp_st, 0008en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>0.431156</td>\n", " <td>their hard work their thoughtfulness and commi...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>en_to_it</td>\n", " <td>0008en_sp_st, 0009en_sp_st, 0010en_sp_st</td>\n", " <td>en_sp_st</td>\n", " <td>0.424000</td>\n", " <td>into our Committee to do that I think has been...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>en_to_it</td>\n", " <td>0010en_sp_st, 0011en_sp_st, 0012en_sp_st, 0013...</td>\n", " <td>en_sp_st</td>\n", " <td>0.427711</td>\n", " <td>that these measures when endorsed and adopted ...</td>\n", 
" </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>137</th>\n", " <td>it_to_en</td>\n", " <td>1045en_wr_tt, 1046en_wr_tt, 1047en_wr_tt, 1048...</td>\n", " <td>en_wr_tt</td>\n", " <td>0.439759</td>\n", " <td>regions and between administrative structures ...</td>\n", " </tr>\n", " <tr>\n", " <th>138</th>\n", " <td>it_to_en</td>\n", " <td>1049en_wr_tt, 1050en_wr_tt, 1051en_wr_tt, 1052...</td>\n", " <td>en_wr_tt</td>\n", " <td>0.442000</td>\n", " <td>by the European Union are those specifically i...</td>\n", " </tr>\n", " <tr>\n", " <th>139</th>\n", " <td>it_to_en</td>\n", " <td>1055en_wr_tt, 1056en_wr_tt, 1057en_wr_tt, 1058...</td>\n", " <td>en_wr_tt</td>\n", " <td>0.463928</td>\n", " <td>couples around the world who every day face th...</td>\n", " </tr>\n", " <tr>\n", " <th>140</th>\n", " <td>it_to_en</td>\n", " <td>1059en_wr_tt, 1060en_wr_tt, 1061en_wr_tt, 1062...</td>\n", " <td>en_wr_tt</td>\n", " <td>0.483903</td>\n", " <td>We are now about to adopt the agreement on Ira...</td>\n", " </tr>\n", " <tr>\n", " <th>141</th>\n", " <td>it_to_en</td>\n", " <td>1063en_wr_tt, 1064en_wr_tt, 1065en_wr_tt, 1066...</td>\n", " <td>en_wr_tt</td>\n", " <td>0.458753</td>\n", " <td>book Premesse della politica Premises of pol...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>142 rows × 5 columns</p>\n", "</div>\n", " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4fe5e669-2f81-4ac6-8b20-635af14803e5')\"\n", " title=\"Convert this dataframe to an interactive table.\"\n", " style=\"display:none;\">\n", " \n", " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", " width=\"24px\">\n", " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 
2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", " </svg>\n", " </button>\n", " \n", " <style>\n", " .colab-df-container {\n", " display:flex;\n", " flex-wrap:wrap;\n", " gap: 12px;\n", " }\n", "\n", " .colab-df-convert {\n", " background-color: #E8F0FE;\n", " border: none;\n", " border-radius: 50%;\n", " cursor: pointer;\n", " display: none;\n", " fill: #1967D2;\n", " height: 32px;\n", " padding: 0 0 0 0;\n", " width: 32px;\n", " }\n", "\n", " .colab-df-convert:hover {\n", " background-color: #E2EBFA;\n", " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", " fill: #174EA6;\n", " }\n", "\n", " [theme=dark] .colab-df-convert {\n", " background-color: #3B4455;\n", " fill: #D2E3FC;\n", " }\n", "\n", " [theme=dark] .colab-df-convert:hover {\n", " background-color: #434B5C;\n", " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", " fill: #FFFFFF;\n", " }\n", " </style>\n", "\n", " <script>\n", " const buttonEl =\n", " document.querySelector('#df-4fe5e669-2f81-4ac6-8b20-635af14803e5 button.colab-df-convert');\n", " buttonEl.style.display =\n", " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", "\n", " async function convertToInteractive(key) {\n", " const element = document.querySelector('#df-4fe5e669-2f81-4ac6-8b20-635af14803e5');\n", " const dataTable =\n", " await google.colab.kernel.invokeFunction('convertToInteractive',\n", " [key], {});\n", " if (!dataTable) return;\n", "\n", " const docLinkHtml = 'Like what you see? 
# %%
# Summarise the per-chunk ratios stored in the 'sttr' column: one mean and one
# median per text_type, both taken from the same grouped view.
sttr_by_type = df.groupby('text_type')['sttr']
mean_ttr = sttr_by_type.mean()
median_ttr = sttr_by_type.median()

# Report both summaries
print("Mean TTR by Text Type:\n", mean_ttr, "\n")
print("Median TTR by Text Type:\n", median_ttr)