lda / data_preparation.ipynb
data_preparation.ipynb
Raw
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "xJJatsUev-We"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import re"
      ],
      "metadata": {
        "id": "40uCf910RUdV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df = pd.read_excel('/content/data_en_it_tagged.xlsx')"
      ],
      "metadata": {
        "id": "w3Iur8DPVlHe"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "9LaOvu7jVpYL",
        "outputId": "02e3a47b-b776-4226-e003-56c94fa73709"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "    direction            id text_type  \\\n",
              "0    en_to_it  0001en_sp_st  en_sp_st   \n",
              "1    en_to_it  0002en_sp_st  en_sp_st   \n",
              "2    en_to_it  0003en_sp_st  en_sp_st   \n",
              "3    en_to_it  0004en_sp_st  en_sp_st   \n",
              "4    en_to_it  0005en_sp_st  en_sp_st   \n",
              "..        ...           ...       ...   \n",
              "523  it_to_en  1064en_wr_tt  en_wr_tt   \n",
              "524  it_to_en  1065en_wr_tt  en_wr_tt   \n",
              "525  it_to_en  1066en_wr_tt  en_wr_tt   \n",
              "526  it_to_en  1067en_wr_tt  en_wr_tt   \n",
              "527  it_to_en  1068en_wr_tt  en_wr_tt   \n",
              "\n",
              "                                                  text  \n",
              "0    Thank/VV  you/PP  President/NP ./SENT  Well/RB...  \n",
              "1    Thank/VV  you/PP  very/RB  much/JJ  Mr/NP  Pre...  \n",
              "2    Excuse/VV  me/PP ./SENT  Thank/VV  you/PP  Pre...  \n",
              "3    President/NP ,/,  the/DT  upheaval/NN  in/IN  ...  \n",
              "4    Thank/VV  you/PP  Mr/NP  President/NP ./SENT  ...  \n",
              "..                                                 ...  \n",
              "523  Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...  \n",
              "524  Mr/NP  President/NP ,/,  High/NP  Representati...  \n",
              "525  Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...  \n",
              "526  Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...  \n",
              "527  Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...  \n",
              "\n",
              "[528 rows x 4 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-0650964d-e86d-423e-a3ed-9f013d66d5ee\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>direction</th>\n",
              "      <th>id</th>\n",
              "      <th>text_type</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0001en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>Thank/VV  you/PP  President/NP ./SENT  Well/RB...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0002en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>Thank/VV  you/PP  very/RB  much/JJ  Mr/NP  Pre...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0003en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>Excuse/VV  me/PP ./SENT  Thank/VV  you/PP  Pre...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0004en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>President/NP ,/,  the/DT  upheaval/NN  in/IN  ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0005en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>Thank/VV  you/PP  Mr/NP  President/NP ./SENT  ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>523</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1064en_wr_tt</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>524</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1065en_wr_tt</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>Mr/NP  President/NP ,/,  High/NP  Representati...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>525</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1066en_wr_tt</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>526</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1067en_wr_tt</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>527</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1068en_wr_tt</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>Mr/NP  President/NP ,/,  ladies/NNS  and/CC  g...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>528 rows × 4 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0650964d-e86d-423e-a3ed-9f013d66d5ee')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-0650964d-e86d-423e-a3ed-9f013d66d5ee button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-0650964d-e86d-423e-a3ed-9f013d66d5ee');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Function to remove tags from text\n",
        "def remove_tags(text):\n",
        "    ignore_tags = ['SENT', ',', '\\'', 'DYSF', ':', 'EPAUSE', '``', 'FPAUSE', 'UNCLEAR', 'NOCAT', 'SYM', 'PUN', '/NOCAT']\n",
        "    words = text.split()\n",
        "    cleaned_words = []\n",
        "    for word in words:\n",
        "        try:\n",
        "            if word.split('/')[1] not in ignore_tags:\n",
        "                cleaned_words.append(word.split('/')[0])\n",
        "        except IndexError: # word does not have a '/'\n",
        "            cleaned_words.append(word)\n",
        "    return ' '.join(cleaned_words)\n",
        "\n",
        "# Apply function to remove tags from \"text\"\n",
        "df['text'] = df['text'].apply(remove_tags)\n",
        "\n",
        "# Function to split texts into chunks of 1000 tokens\n",
        "def split_text(text):\n",
        "    words = text.split()\n",
        "    return [words[i:i+1000] for i in range(0, len(words), 1000)]\n",
        "\n",
        "# Function to handle rows merging/splitting\n",
        "def handle_rows(df):\n",
        "    rows = []\n",
        "    buffer_text = []\n",
        "    buffer_id = []\n",
        "\n",
        "    for index, row in df.iterrows():\n",
        "        chunks = split_text(row['text'])\n",
        "        if len(buffer_text) > 0 and buffer_id[-1].split('_')[-1] != row['id'].split('_')[-1]: # Check if text_type has changed\n",
        "            if len(buffer_text) >= 1000:\n",
        "                rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n",
        "            buffer_text = []\n",
        "            buffer_id = []\n",
        "        for i, chunk in enumerate(chunks):\n",
        "            if i == 0 and len(buffer_text) < 1000:\n",
        "                buffer_text += chunk\n",
        "                buffer_id.append(row['id'])\n",
        "                if len(buffer_text) >= 1000:\n",
        "                    rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n",
        "                    buffer_text = buffer_text[1000:]\n",
        "                    buffer_id = [row['id']]\n",
        "            else:\n",
        "                buffer_text = chunk\n",
        "                buffer_id = [row['id']]\n",
        "                if len(buffer_text) >= 1000:\n",
        "                    rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n",
        "                    buffer_text = buffer_text[1000:]\n",
        "    return pd.DataFrame(rows)\n",
        "\n",
        "# Apply function to handle rows merging/splitting\n",
        "df = handle_rows(df)\n",
        "\n",
        "# Print the number of tokens for each unique value in the text_type column\n",
        "token_counts = df.groupby('text_type')['text'].apply(lambda x: sum(len(text.split()) for text in x))\n",
        "print(token_counts)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NvfBC4F_vXdL",
        "outputId": "1b27d368-720e-4acd-f609-8c959d6542a7"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "text_type\n",
            "en_sp_st    20000\n",
            "en_sp_tt    16000\n",
            "en_wr_st    19000\n",
            "en_wr_tt    18000\n",
            "it_sp_st    17000\n",
            "it_sp_tt    17000\n",
            "it_wr_st    17000\n",
            "it_wr_tt    18000\n",
            "Name: text, dtype: int64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def calculate_sttr(text):\n",
        "\n",
        "    # lowercase and tokenize the text\n",
        "    tokens = text.lower().split()\n",
        "\n",
        "    # number of tokens (total number of words)\n",
        "    num_tokens = len(tokens)\n",
        "\n",
        "    # number of types (unique words)\n",
        "    num_types = len(set(tokens))\n",
        "\n",
        "    # calculate Type-Token Ratio (TTR)\n",
        "    ttr = num_types / num_tokens if num_tokens > 0 else 0\n",
        "\n",
        "    return ttr\n",
        "\n",
        "# apply the function to the 'text' column and create a new 'ttr' column\n",
        "df['sttr'] = df['text'].apply(calculate_sttr)"
      ],
      "metadata": {
        "id": "LtIT3tv9GGid"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df = df[['direction', 'id', 'text_type', 'sttr', 'text']]"
      ],
      "metadata": {
        "id": "CmjUKuzBGUDO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "ZBPNYPe7GITS",
        "outputId": "8622cc0f-eb8b-4ccc-892a-e452e8196d06"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "    direction                                                 id text_type  \\\n",
              "0    en_to_it           0001en_sp_st, 0002en_sp_st, 0003en_sp_st  en_sp_st   \n",
              "1    en_to_it  0003en_sp_st, 0004en_sp_st, 0005en_sp_st, 0006...  en_sp_st   \n",
              "2    en_to_it           0006en_sp_st, 0007en_sp_st, 0008en_sp_st  en_sp_st   \n",
              "3    en_to_it           0008en_sp_st, 0009en_sp_st, 0010en_sp_st  en_sp_st   \n",
              "4    en_to_it  0010en_sp_st, 0011en_sp_st, 0012en_sp_st, 0013...  en_sp_st   \n",
              "..        ...                                                ...       ...   \n",
              "137  it_to_en  1045en_wr_tt, 1046en_wr_tt, 1047en_wr_tt, 1048...  en_wr_tt   \n",
              "138  it_to_en  1049en_wr_tt, 1050en_wr_tt, 1051en_wr_tt, 1052...  en_wr_tt   \n",
              "139  it_to_en  1055en_wr_tt, 1056en_wr_tt, 1057en_wr_tt, 1058...  en_wr_tt   \n",
              "140  it_to_en  1059en_wr_tt, 1060en_wr_tt, 1061en_wr_tt, 1062...  en_wr_tt   \n",
              "141  it_to_en  1063en_wr_tt, 1064en_wr_tt, 1065en_wr_tt, 1066...  en_wr_tt   \n",
              "\n",
              "         sttr                                               text  \n",
              "0    0.461924  Thank you President Well some colleagues took ...  \n",
              "1    0.478478  refrain from using violence and that there wil...  \n",
              "2    0.431156  their hard work their thoughtfulness and commi...  \n",
              "3    0.424000  into our Committee to do that I think has been...  \n",
              "4    0.427711  that these measures when endorsed and adopted ...  \n",
              "..        ...                                                ...  \n",
              "137  0.439759  regions and between administrative structures ...  \n",
              "138  0.442000  by the European Union are those specifically i...  \n",
              "139  0.463928  couples around the world who every day face th...  \n",
              "140  0.483903  We are now about to adopt the agreement on Ira...  \n",
              "141  0.458753  book Premesse della politica   Premises of pol...  \n",
              "\n",
              "[142 rows x 5 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-4fe5e669-2f81-4ac6-8b20-635af14803e5\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>direction</th>\n",
              "      <th>id</th>\n",
              "      <th>text_type</th>\n",
              "      <th>sttr</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0001en_sp_st, 0002en_sp_st, 0003en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>0.461924</td>\n",
              "      <td>Thank you President Well some colleagues took ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0003en_sp_st, 0004en_sp_st, 0005en_sp_st, 0006...</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>0.478478</td>\n",
              "      <td>refrain from using violence and that there wil...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0006en_sp_st, 0007en_sp_st, 0008en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>0.431156</td>\n",
              "      <td>their hard work their thoughtfulness and commi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0008en_sp_st, 0009en_sp_st, 0010en_sp_st</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>0.424000</td>\n",
              "      <td>into our Committee to do that I think has been...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>en_to_it</td>\n",
              "      <td>0010en_sp_st, 0011en_sp_st, 0012en_sp_st, 0013...</td>\n",
              "      <td>en_sp_st</td>\n",
              "      <td>0.427711</td>\n",
              "      <td>that these measures when endorsed and adopted ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>137</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1045en_wr_tt, 1046en_wr_tt, 1047en_wr_tt, 1048...</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>0.439759</td>\n",
              "      <td>regions and between administrative structures ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>138</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1049en_wr_tt, 1050en_wr_tt, 1051en_wr_tt, 1052...</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>0.442000</td>\n",
              "      <td>by the European Union are those specifically i...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>139</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1055en_wr_tt, 1056en_wr_tt, 1057en_wr_tt, 1058...</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>0.463928</td>\n",
              "      <td>couples around the world who every day face th...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>140</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1059en_wr_tt, 1060en_wr_tt, 1061en_wr_tt, 1062...</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>0.483903</td>\n",
              "      <td>We are now about to adopt the agreement on Ira...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>141</th>\n",
              "      <td>it_to_en</td>\n",
              "      <td>1063en_wr_tt, 1064en_wr_tt, 1065en_wr_tt, 1066...</td>\n",
              "      <td>en_wr_tt</td>\n",
              "      <td>0.458753</td>\n",
              "      <td>book Premesse della politica   Premises of pol...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>142 rows × 5 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4fe5e669-2f81-4ac6-8b20-635af14803e5')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-4fe5e669-2f81-4ac6-8b20-635af14803e5 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-4fe5e669-2f81-4ac6-8b20-635af14803e5');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 110
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# calculate the mean and median of ttr for each unique text_type\n",
        "mean_ttr = df.groupby('text_type')['sttr'].mean()\n",
        "median_ttr = df.groupby('text_type')['sttr'].median()\n",
        "\n",
        "# print the mean and median of ttr for each unique text_type\n",
        "print(\"Mean TTR by Text Type:\\n\", mean_ttr, \"\\n\")\n",
        "print(\"Median TTR by Text Type:\\n\", median_ttr)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "T_xS5zqVG0Wc",
        "outputId": "24ad2935-a2a7-447d-aa93-045b7f8a3a66"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mean TTR by Text Type:\n",
            " text_type\n",
            "en_sp_st    0.425851\n",
            "en_sp_tt    0.412468\n",
            "en_wr_st    0.434631\n",
            "en_wr_tt    0.441960\n",
            "it_sp_st    0.498941\n",
            "it_sp_tt    0.459118\n",
            "it_wr_st    0.499294\n",
            "it_wr_tt    0.500667\n",
            "Name: sttr, dtype: float64 \n",
            "\n",
            "Median TTR by Text Type:\n",
            " text_type\n",
            "en_sp_st    0.422000\n",
            "en_sp_tt    0.409408\n",
            "en_wr_st    0.434870\n",
            "en_wr_tt    0.442000\n",
            "it_sp_st    0.498000\n",
            "it_sp_tt    0.465000\n",
            "it_wr_st    0.503000\n",
            "it_wr_tt    0.496000\n",
            "Name: sttr, dtype: float64\n"
          ]
        }
      ]
    }
  ]
}