code/data/datacleaner.py · CoC-code

import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as loading finishes (or fails).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The three dataset splits that the rest of this script cleans and re-saves.
train = _load_pickle('../data/met_train.pkl')
test = _load_pickle('../data/met_test.pkl')
dev = _load_pickle('../data/met_dev.pkl')

import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

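# One-time setup: the NLTK resources used below must be downloaded before
# this script can run (exact resource names may vary with the NLTK version):
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')                       # used by word_tokenize
# nltk.download('averaged_perceptron_tagger')  # used by pos_tag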
def tokenize(sentence):
    """Tokenize ``sentence`` and attach a part-of-speech tag to every token.

    Every run of whitespace is first collapsed to a single space. The text
    is then split with ``word_tokenize`` and the tokens are handed to
    ``pos_tag``.

    Returns:
        The output of ``pos_tag``; ``stem`` unpacks each item of it as a
        ``(word, tag)`` pair.
    """
    normalized = re.sub(r'\s+', ' ', sentence)
    return pos_tag(word_tokenize(normalized))


wordnet_lematizer = WordNetLemmatizer()

# Tag prefix -> ``pos`` code passed to the lemmatizer. Checked in this order;
# the first matching prefix wins.
_TAG_PREFIX_TO_POS = (
    ('NN', 'n'),
    ('VB', 'v'),
    ('JJ', 'a'),
    ('R', 'r'),
)


def stem(token_words):
    """Lemmatize tagged tokens and return the lemmas as a list of strings.

    Args:
        token_words: iterable of ``(word, tag)`` pairs.

    Returns:
        One lemma per input pair, in input order. The tag chooses the
        ``pos`` argument given to the lemmatizer; when no known prefix
        matches, the lemmatizer is called with its default ``pos``.
    """
    lemmas = []
    for word, tag in token_words:
        for prefix, pos in _TAG_PREFIX_TO_POS:
            if tag.startswith(prefix):
                lemma = wordnet_lematizer.lemmatize(word, pos=pos)
                break
        else:
            # No prefix matched: fall back to the lemmatizer's default.
            lemma = wordnet_lematizer.lemmatize(word)
        lemmas.append(lemma)
    return lemmas


sr = stopwords.words('english')

# Frozen-set copy of the stop-word list, built once, so that each membership
# test below is a hash lookup instead of a scan over the whole list.
_STOPWORD_SET = frozenset(sr)


def delete_stopwords(token_words):
    """Return the tokens from ``token_words`` that are not English stop words.

    Args:
        token_words: iterable of token strings.

    Returns:
        A new list with the surviving tokens in their original order. The
        comparison is an exact match, so it is case-sensitive.
    """
    return [word for word in token_words if word not in _STOPWORD_SET]


def is_number(s):
    """Return True if ``s`` can be read as a number, otherwise False.

    Two checks are tried in order:

    1. ``float(s)`` -- ordinary numeric literals such as ``"3.14"``.
    2. ``unicodedata.numeric(s)`` -- a single Unicode character that carries
       a numeric value.

    Inputs that neither check accepts, including non-string values such as
    ``None``, yield False instead of raising.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # ValueError: not a numeric literal. TypeError: not a type float()
        # accepts at all (e.g. None) -- treated the same as "not a number".
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single character. ValueError: no numeric value.
        pass

    return False


# Token texts that are dropped by delete_characters.
characters = [' ', ',', '.', 'DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...',
              '^', '{', '}']


def delete_characters(token_words):
    """Drop every token whose text is listed in ``characters``.

    Args:
        token_words: iterable of tokens. Each token is either a plain string
            or a tuple whose first element is the token text, such as the
            ``(word, tag)`` pairs that ``pre_process`` passes in.

    Returns:
        A new list with the surviving tokens, unchanged and in their
        original order.
    """
    kept = []
    for token in token_words:
        # A tuple never compares equal to a string in ``characters``, so
        # tagged tokens must be matched on their text, not as a whole.
        text = token[0] if isinstance(token, tuple) and token else token
        if text not in characters:
            kept.append(token)
    return kept


def to_lower(token_words):
    """Return a new list holding the lower-cased form of every token.

    Args:
        token_words: iterable of objects that provide a ``lower()`` method.

    Returns:
        A list with ``token.lower()`` for each input token, in order.
    """
    lowered = []
    for token in token_words:
        lowered.append(token.lower())
    return lowered


def pre_process(text):
    """Run the full cleaning pipeline on one raw text.

    The stages run in this fixed order, each receiving the previous stage's
    output: ``tokenize``, ``delete_characters``, ``stem``, ``to_lower``,
    ``delete_stopwords``.

    Returns:
        The list produced by the final stage.
    """
    # NOTE(review): lemmatization runs before lower-casing; confirm the
    # lemmatizer treats capitalized words the way this pipeline expects.
    stages = (tokenize, delete_characters, stem, to_lower, delete_stopwords)
    result = text
    for stage in stages:
        result = stage(result)
    return result
def _clean_raw_texts(split):
    """Replace each entry of ``split['raw_texts']`` with its cleaned form.

    The number of entries processed is ``len(split['id'])``. Every text is
    run through ``pre_process`` and the resulting tokens are joined with
    single spaces and written back in place.
    """
    for idx in range(len(split['id'])):
        cleaned_tokens = pre_process(split['raw_texts'][idx])
        split['raw_texts'][idx] = ' '.join(cleaned_tokens)


# Report the sizes of the two training columns before cleaning.
print(len(train['id']))
print(len(train['raw_texts']))

# Clean all three splits first; nothing is written until all have succeeded.
for _split in (train, test, dev):
    _clean_raw_texts(_split)

# NOTE(review): the test output name lacks the underscore after "met" that
# the train/dev names have. Kept as-is because other code may read this exact
# file name -- confirm before renaming.
_outputs = (
    ('../data/met_textclean_train.pkl', train),
    ('../data/mettextclean_test.pkl', test),
    ('../data/met_textclean_dev.pkl', dev),
)
for _path, _split in _outputs:
    with open(_path, 'wb') as f:
        pickle.dump(_split, f)