{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "xJJatsUev-We" ] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "import pandas as pd\n", "import re" ], "metadata": { "id": "40uCf910RUdV" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df = pd.read_excel('/content/data_en_it_tagged.xlsx')" ], "metadata": { "id": "w3Iur8DPVlHe" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "9LaOvu7jVpYL", "outputId": "02e3a47b-b776-4226-e003-56c94fa73709" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " direction id text_type \\\n", "0 en_to_it 0001en_sp_st en_sp_st \n", "1 en_to_it 0002en_sp_st en_sp_st \n", "2 en_to_it 0003en_sp_st en_sp_st \n", "3 en_to_it 0004en_sp_st en_sp_st \n", "4 en_to_it 0005en_sp_st en_sp_st \n", ".. ... ... ... \n", "523 it_to_en 1064en_wr_tt en_wr_tt \n", "524 it_to_en 1065en_wr_tt en_wr_tt \n", "525 it_to_en 1066en_wr_tt en_wr_tt \n", "526 it_to_en 1067en_wr_tt en_wr_tt \n", "527 it_to_en 1068en_wr_tt en_wr_tt \n", "\n", " text \n", "0 Thank/VV you/PP President/NP ./SENT Well/RB... \n", "1 Thank/VV you/PP very/RB much/JJ Mr/NP Pre... \n", "2 Excuse/VV me/PP ./SENT Thank/VV you/PP Pre... \n", "3 President/NP ,/, the/DT upheaval/NN in/IN ... \n", "4 Thank/VV you/PP Mr/NP President/NP ./SENT ... \n", ".. ... \n", "523 Mr/NP President/NP ,/, ladies/NNS and/CC g... \n", "524 Mr/NP President/NP ,/, High/NP Representati... \n", "525 Mr/NP President/NP ,/, ladies/NNS and/CC g... \n", "526 Mr/NP President/NP ,/, ladies/NNS and/CC g... \n", "527 Mr/NP President/NP ,/, ladies/NNS and/CC g... \n", "\n", "[528 rows x 4 columns]" ], "text/html": [ "\n", " <div word does not have a '/'\n", " cleaned_words.append(word)\n", " return ' '.join(cleaned_words)\n", "\n", "# Apply function to remove tags from \"text\"\n", "df['text'] = df['text'].apply(remove_tags)\n", "\n", "# Function to split texts into chunks of 1000 tokens\n", "def split_text(text):\n", " words = text.split()\n", " return [words[i:i+1000] for i in range(0, len(words), 1000)]\n", "\n", "# Function to handle rows merging/splitting\n", "def handle_rows(df):\n", " rows = []\n", " buffer_text = []\n", " buffer_id = []\n", "\n", " for index, row in df.iterrows():\n", " chunks = split_text(row['text'])\n", " if len(buffer_text) > 0 and buffer_id[-1].split('_')[-1] != row['id'].split('_')[-1]: # Check if text_type has changed\n", " if len(buffer_text) >= 1000:\n", " rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n", " buffer_text = []\n", " buffer_id = []\n", " for i, chunk in word does not have a '/'\n", " cleaned_words.append(word)\n", " return ' '.join(cleaned_words)\n", "\n", "# Apply function to remove tags from \"text\"\n", "df['text'] = df['text'].apply(remove_tags)\n", "\n", "# Function to split texts into chunks of 1000 tokens\n", "def split_text(text):\n", " words = text.split()\n", " return [words[i:i+1000] for i in range(0, len(words), 1000)]\n", "\n", "# Function to handle rows merging/splitting\n", "def handle_rows(df):\n", " rows = []\n", " buffer_text = []\n", " buffer_id = []\n", "\n", " for index, row in df.iterrows():\n", " chunks = split_text(row['text'])\n", " if len(buffer_text) > 0 and buffer_id[-1].split('_')[-1] != row['id'].split('_')[-1]: # Check if text_type has changed\n", " if len(buffer_text) >= 1000:\n", " rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n", " buffer_text = []\n", " buffer_id = []\n", " for i, chunk in enumerate(chunks):\n", " if i == 0 and len(buffer_text) < 1000:\n", " buffer_text += chunk\n", " buffer_id.append(row['id'])\n", " if len(buffer_text) >= 1000:\n", " rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n", " buffer_text = buffer_text[1000:]\n", " buffer_id = [row['id']]\n", " else:\n", " buffer_text = chunk\n", " buffer_id = [row['id']]\n", " if len(buffer_text) >= 1000:\n", " rows.append({'direction': row['direction'], 'id': ', '.join(buffer_id), 'text_type': row['text_type'], 'text': ' '.join(buffer_text[:1000])})\n", " buffer_text = buffer_text[1000:]\n", " return pd.DataFrame(rows)\n", "\n", "# Apply function to handle rows merging/splitting\n", "df = handle_rows(df)\n", "\n", "# Print the number of tokens for each unique value in the text_type column\n", "token_counts = df.groupby('text_type')['text'].apply(lambda x: sum(len(text.split()) for text in x))\n", "print(token_counts)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NvfBC4F_vXdL", "outputId": "1b27d368-720e-4acd-f609-8c959d6542a7" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "text_type\n", "en_sp_st 20000\n", "en_sp_tt 16000\n", "en_wr_st 19000\n", "en_wr_tt 18000\n", "it_sp_st 17000\n", "it_sp_tt 17000\n", "it_wr_st 17000\n", "it_wr_tt 18000\n", "Name: text, dtype: int64\n" ] } ] }, { "cell_type": "code", "source": [ "def calculate_sttr(text):\n", "\n", " # lowercase and tokenize the text\n", " tokens = text.lower().split()\n", "\n", " # number of tokens (total number of words)\n", " num_tokens = len(tokens)\n", "\n", " # number of types (unique words)\n", " num_types = len(set(tokens))\n", "\n", " # calculate Type-Token Ratio (TTR)\n", " ttr = num_types / num_tokens if num_tokens > 0 else 0\n", "\n", " return ttr\n", "\n", "# apply the function to the 'text' column and create a new 'ttr' column\n", "df['sttr'] = df['text'].apply(calculate_sttr)" ], "metadata": { "id": "LtIT3tv9GGid" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df = df[['direction', 'id', 'text_type', 'sttr', 'text']]" ], "metadata": { "id": "CmjUKuzBGUDO" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "ZBPNYPe7GITS", "outputId": "8622cc0f-eb8b-4ccc-892a-e452e8196d06" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " direction id text_type \\\n", "0 en_to_it 0001en_sp_st, 0002en_sp_st, 0003en_sp_st en_sp_st \n", "1 en_to_it 0003en_sp_st, 0004en_sp_st, 0005en_sp_st, 0006... en_sp_st \n", "2 en_to_it 0006en_sp_st, 0007en_sp_st, 0008en_sp_st en_sp_st \n", "3 en_to_it 0008en_sp_st, 0009en_sp_st, 0010en_sp_st en_sp_st \n", "4 en_to_it 0010en_sp_st, 0011en_sp_st, 0012en_sp_st, 0013... en_sp_st \n", ".. ... ... ... \n", "137 it_to_en 1045en_wr_tt, 1046en_wr_tt, 1047en_wr_tt, 1048... en_wr_tt \n", "138 it_to_en 1049en_wr_tt, 1050en_wr_tt, 1051en_wr_tt, 1052... en_wr_tt \n", "139 it_to_en 1055en_wr_tt, 1056en_wr_tt, 1057en_wr_tt, 1058... en_wr_tt \n", "140 it_to_en 1059en_wr_tt, 1060en_wr_tt, 1061en_wr_tt, 1062... en_wr_tt \n", "141 it_to_en 1063en_wr_tt, 1064en_wr_tt, 1065en_wr_tt, 1066... en_wr_tt \n", "\n", " sttr text \n", "0 0.461924 Thank you President Well some colleagues took ... \n", "1 0.478478 refrain from using violence and that there wil... \n", "2 0.431156 their hard work their thoughtfulness and commi... \n", "3 0.424000 into our Committee to do that I think has been... \n", "4 0.427711 that these measures when endorsed and adopted ... \n", ".. ... ... \n", "137 0.439759 regions and between administrative structures ... \n", "138 0.442000 by the European Union are those specifically i... \n", "139 0.463928 couples around the world who every day face th... \n", "140 0.483903 