"""Build, save and load the token vocabulary for passage/question/answer data."""

import os
import pickle
import string
import sys
from collections import Counter

import pandas as pd
import torch
import torchtext
import tqdm
from torchtext.data import get_tokenizer

from vocab import Vocabulary

# File name of the pickled vocabulary.
VOCAB_FILENAME = 'savedVocab'

# Substrings that get padded with spaces so they become separate tokens.
# Applied sequentially in insertion order; note that the em dash and en dash
# are both normalised to an ASCII hyphen.
_TOKEN_SPACING = {
    '$': ' $ ',
    '-': ' - ',
    '£': ' £ ',
    '₹': ' ₹ ',
    '“': ' “ ',
    "'": " ' ",
    '/': ' / ',
    '[': ' [ ',
    ']': ' ] ',
    '—': ' - ',
    '–': ' - ',
    '¢': ' ¢ ',
    '‘‘': ' ‘‘ ',
    '€': ' € ',
    '<': ' < ',
    '”': ' ” ',
    '`': ' ` ',
    '+': ' + ',
    '’': ' ’ ',
    '°': ' ° ',
    '″': ' ″ ',
    '−': ' − ',
    '×': ' × ',
}


def clean_text(text: str) -> str:
    """Pad symbols and punctuation with spaces to increase GloVe coverage.

    Every key of ``_TOKEN_SPACING`` found in *text* is replaced by its
    space-padded value, one key after another in table order.

    Args:
        text: The raw string to preprocess.

    Returns:
        The string with all table entries substituted.
    """
    for symbol, spaced in _TOKEN_SPACING.items():
        text = text.replace(symbol, spaced)
    return text


def _vocab_path() -> str:
    """Return the single location used to both save and load the vocabulary."""
    # Resolved at call time against sys.path[0], the location load_vocab has
    # always read from, so previously saved files there are still found.
    return os.path.join(sys.path[0], VOCAB_FILENAME)


def save_vocab(vocab):
    """Pickle *vocab* to the shared vocabulary file.

    Args:
        vocab: The vocabulary object to persist.
    """
    # Write to the same path load_vocab reads from; writing relative to the
    # current working directory could leave load_vocab looking elsewhere.
    with open(_vocab_path(), 'wb') as saved_vocab:
        pickle.dump(vocab, saved_vocab)
    print("Saved the vocab.")


def load_vocab():
    """Load the pickled vocabulary, building it first if no file exists.

    Returns:
        The unpickled vocabulary object.
    """
    path = _vocab_path()
    if not os.path.exists(path):
        build_vocab()
    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # vocabulary files that this project produced itself.
    with open(path, 'rb') as saved_vocab:
        vocab = pickle.load(saved_vocab)
    print("loaded vocab")
    return vocab


def _collect_tokens(frame, tokenizer):
    """Tokenize the passage, question and answer of every row of *frame*."""
    tokens = []
    columns = zip(frame['passage'], frame['question'], frame['answer'])
    for passage, question, answer in columns:
        for field in (passage, question, answer):
            # str() mirrors the original handling of non-string cells
            # (e.g. NaN or boolean answers).
            tokens.extend(tokenizer(str(field)))
    return tokens


def build_vocab(min_count: int = 2):
    """Build the vocabulary from the train and validation CSVs and save it.

    Reads ``./data/train.csv`` and ``./data/val.csv``, tokenizes the
    ``passage``, ``question`` and ``answer`` columns with torchtext's
    ``basic_english`` tokenizer, and adds every token seen at least
    *min_count* times, in order of first appearance.

    Args:
        min_count: Minimum number of occurrences for a token to be kept.

    Returns:
        The populated ``Vocabulary``, which is also written by ``save_vocab``.
    """
    train = pd.read_csv('./data/train.csv')
    val = pd.read_csv('./data/val.csv')
    vocab = Vocabulary()
    tokenizer = get_tokenizer("basic_english")

    print("Building Vocabulary")
    words = _collect_tokens(train, tokenizer)
    words += _collect_tokens(val, tokenizer)

    # Counter iterates in first-insertion order, so kept tokens are added in
    # the order they first appear in the data.
    counts = Counter(words)
    for word, count in counts.items():
        if count >= min_count:
            vocab.add_word(word)

    save_vocab(vocab)
    return vocab