In [None]:
import pandas as pd
import re

In [None]:
# Load the dataset from an Excel workbook into a DataFrame.
# NOTE(review): the path is absolute and looks like a Colab '/content' upload location —
# the notebook only runs where this file exists; confirm and consider a configurable path.
df = pd.read_excel('/content/data_en_it_tagged.xlsx')

In [None]:
# Display the loaded frame (the cell's last expression is rendered as output).
df

Unnamed: 0,direction,id,text_type,text
0,en_to_it,0001en_sp_st,en_sp_st,Thank/VV you/PP President/NP ./SENT Well/RB...
1,en_to_it,0002en_sp_st,en_sp_st,Thank/VV you/PP very/RB much/JJ Mr/NP Pre...
2,en_to_it,0003en_sp_st,en_sp_st,Excuse/VV me/PP ./SENT Thank/VV you/PP Pre...
3,en_to_it,0004en_sp_st,en_sp_st,"President/NP ,/, the/DT upheaval/NN in/IN ..."
4,en_to_it,0005en_sp_st,en_sp_st,Thank/VV you/PP Mr/NP President/NP ./SENT ...
...,...,...,...,...
523,it_to_en,1064en_wr_tt,en_wr_tt,"Mr/NP President/NP ,/, ladies/NNS and/CC g..."
524,it_to_en,1065en_wr_tt,en_wr_tt,"Mr/NP President/NP ,/, High/NP Representati..."
525,it_to_en,1066en_wr_tt,en_wr_tt,"Mr/NP President/NP ,/, ladies/NNS and/CC g..."
526,it_to_en,1067en_wr_tt,en_wr_tt,"Mr/NP President/NP ,/, ladies/NNS and/CC g..."


In [None]:
# Function to remove tags from text
def remove_tags(text):
    """Strip the ``/TAG`` suffixes from a tagged text and drop ignored tokens.

    The text is split on whitespace. Each piece that contains a ``/`` is read
    as ``word/TAG``, splitting on the *last* slash so that a word which itself
    contains a slash (e.g. ``and/or/CC`` or ``//NOCAT``) keeps its full form
    and is matched against its real tag. Pieces whose tag is in the ignore
    list are dropped; all other pieces contribute their word part. Pieces
    without any slash are kept unchanged.

    Args:
        text: String of whitespace-separated ``word/TAG`` tokens.

    Returns:
        str: The kept words joined by single spaces.
    """
    # Tags whose tokens are removed entirely. A set gives O(1) membership
    # tests. The former '/NOCAT' entry is gone: a tag taken after the last
    # slash can never contain a slash, and '//NOCAT' now resolves to 'NOCAT'.
    ignore_tags = {'SENT', ',', '\'', 'DYSF', ':', 'EPAUSE', '``', 'FPAUSE',
                   'UNCLEAR', 'NOCAT', 'SYM', 'PUN'}
    cleaned_words = []
    for word in text.split():
        token, separator, tag = word.rpartition('/')
        if not separator:
            # No '/' at all: nothing to strip, keep the piece as it is.
            cleaned_words.append(word)
        elif tag not in ignore_tags:
            cleaned_words.append(token)
    return ' '.join(cleaned_words)

# Apply function to remove tags from "text".
# The 'text' column is overwritten in place, so the tagged form is no longer
# available in `df` after this line.
df['text'] = df['text'].apply(remove_tags)

# Function to split texts into chunks of `chunk_size` tokens (1000 by default)
def split_text(text, chunk_size=1000):
    """Split a text into consecutive chunks of at most ``chunk_size`` tokens.

    Tokens are the whitespace-separated pieces of ``text``. Every chunk holds
    exactly ``chunk_size`` tokens except possibly the last one, which holds
    the remainder.

    Args:
        text: String to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        list[list[str]]: The chunks as lists of tokens (not joined strings);
        an empty list when ``text`` has no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]

# Function to handle rows merging/splitting
def handle_rows(df, chunk_size=1000):
    """Re-cut the rows of ``df`` into rows of exactly ``chunk_size`` tokens.

    Rows are read in order. The whitespace-separated tokens of consecutive
    rows that share the same ``direction`` and ``text_type`` are concatenated
    in a running buffer; every time the buffer holds at least ``chunk_size``
    tokens, the first ``chunk_size`` of them are emitted as one output row and
    the rest stay in the buffer. When ``direction``/``text_type`` changes, or
    the input ends, the incomplete remainder in the buffer is discarded, so
    every output row has exactly ``chunk_size`` tokens and a single text type.

    Args:
        df: DataFrame with the columns ``direction``, ``id``, ``text_type``
            and ``text`` (``text`` must hold strings).
        chunk_size: Number of tokens per output row; must be positive.

    Returns:
        pandas.DataFrame: Columns ``direction``, ``id``, ``text_type`` and
        ``text``. ``id`` is the ``', '``-joined list of the input ids that
        contributed tokens to the chunk; ``text`` is the chunk's tokens joined
        by single spaces. The columns are present even when no chunk could be
        produced.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    columns = ['direction', 'id', 'text_type', 'text']
    rows = []
    buffer_text = []   # tokens waiting to be emitted
    buffer_id = []     # ids of the input rows those tokens came from
    buffer_key = None  # (direction, text_type) the buffer belongs to

    for _, row in df.iterrows():
        key = (row['direction'], row['text_type'])
        if key != buffer_key:
            # Compare the full text_type (not just the trailing part of the
            # id) so that chunks never mix text types. The leftover of the
            # previous group is shorter than chunk_size and is dropped.
            buffer_text = []
            buffer_id = []
            buffer_key = key

        words = row['text'].split()
        if not words:
            continue  # an empty text contributes neither tokens nor an id

        buffer_text.extend(words)
        buffer_id.append(row['id'])

        # Emit as many full chunks as the buffer holds; the remainder is kept
        # (never overwritten) so no tokens are lost inside a group.
        while len(buffer_text) >= chunk_size:
            rows.append({
                'direction': row['direction'],
                'id': ', '.join(buffer_id),
                'text_type': row['text_type'],
                'text': ' '.join(buffer_text[:chunk_size]),
            })
            buffer_text = buffer_text[chunk_size:]
            # Only the current row can still have tokens left in the buffer.
            buffer_id = [row['id']] if buffer_text else []

    return pd.DataFrame(rows, columns=columns)

# Apply function to handle rows merging/splitting.
# Note: this rebinds `df` to the chunked frame, so the pre-chunking rows are
# no longer reachable under this name and re-running the cell re-chunks the chunks.
df = handle_rows(df)

# Print the number of tokens for each unique value in the text_type column
# (sum of the whitespace-separated token counts of all rows of that text_type).
token_counts = df.groupby('text_type')['text'].apply(lambda x: sum(len(text.split()) for text in x))
print(token_counts)

text_type
en_sp_st    20000
en_sp_tt    16000
en_wr_st    19000
en_wr_tt    18000
it_sp_st    17000
it_sp_tt    17000
it_wr_st    17000
it_wr_tt    18000
Name: text, dtype: int64


In [None]:
def calculate_sttr(text):
    """Return the type-token ratio of ``text``.

    The text is lowercased and split on whitespace; the ratio is the number
    of distinct tokens divided by the total number of tokens.

    Args:
        text: String to measure.

    Returns:
        The ratio as a float, or ``0`` when the text contains no tokens.
    """
    words = text.lower().split()

    # Guard against division by zero for empty / whitespace-only texts.
    if not words:
        return 0

    distinct_words = set(words)
    return len(distinct_words) / len(words)

# apply the function to the 'text' column and store the result in a new 'sttr' column
# (one type-token ratio per row)
df['sttr'] = df['text'].apply(calculate_sttr)

In [None]:
# Select the columns in display order, placing 'sttr' before 'text'.
df = df[['direction', 'id', 'text_type', 'sttr', 'text']]

In [None]:
# Display the chunked frame with its per-row 'sttr' values.
df

Unnamed: 0,direction,id,text_type,sttr,text
0,en_to_it,"0001en_sp_st, 0002en_sp_st, 0003en_sp_st",en_sp_st,0.461924,Thank you President Well some colleagues took ...
1,en_to_it,"0003en_sp_st, 0004en_sp_st, 0005en_sp_st, 0006...",en_sp_st,0.478478,refrain from using violence and that there wil...
2,en_to_it,"0006en_sp_st, 0007en_sp_st, 0008en_sp_st",en_sp_st,0.431156,their hard work their thoughtfulness and commi...
3,en_to_it,"0008en_sp_st, 0009en_sp_st, 0010en_sp_st",en_sp_st,0.424000,into our Committee to do that I think has been...
4,en_to_it,"0010en_sp_st, 0011en_sp_st, 0012en_sp_st, 0013...",en_sp_st,0.427711,that these measures when endorsed and adopted ...
...,...,...,...,...,...
137,it_to_en,"1045en_wr_tt, 1046en_wr_tt, 1047en_wr_tt, 1048...",en_wr_tt,0.439759,regions and between administrative structures ...
138,it_to_en,"1049en_wr_tt, 1050en_wr_tt, 1051en_wr_tt, 1052...",en_wr_tt,0.442000,by the European Union are those specifically i...
139,it_to_en,"1055en_wr_tt, 1056en_wr_tt, 1057en_wr_tt, 1058...",en_wr_tt,0.463928,couples around the world who every day face th...
140,it_to_en,"1059en_wr_tt, 1060en_wr_tt, 1061en_wr_tt, 1062...",en_wr_tt,0.483903,We are now about to adopt the agreement on Ira...


In [None]:
# group the per-row ratios by text_type once, then summarise each group
sttr_by_type = df.groupby('text_type')['sttr']
mean_ttr = sttr_by_type.mean()
median_ttr = sttr_by_type.median()

# report both summaries, one printed block per statistic
print("Mean TTR by Text Type:\n", mean_ttr, "\n")
print("Median TTR by Text Type:\n", median_ttr)

Mean TTR by Text Type:
 text_type
en_sp_st    0.425851
en_sp_tt    0.412468
en_wr_st    0.434631
en_wr_tt    0.441960
it_sp_st    0.498941
it_sp_tt    0.459118
it_wr_st    0.499294
it_wr_tt    0.500667
Name: sttr, dtype: float64 

Median TTR by Text Type:
 text_type
en_sp_st    0.422000
en_sp_tt    0.409408
en_wr_st    0.434870
en_wr_tt    0.442000
it_sp_st    0.498000
it_sp_tt    0.465000
it_wr_st    0.503000
it_wr_tt    0.496000
Name: sttr, dtype: float64
