#include #include #include #include #include #include "csvstream.h" #include using namespace std; class classifier { private: //map representing the number of posts for each unique label map num_post_label; //map of map representing the number of posts for each unique word given a label map> num_post_label_with_word; //map representing the number of occurances for each unique word in the whole file map num_words; //number of posts in the file int num_posts; //number of unique words in the file int num_unique_words; public: classifier(): num_posts(0), num_unique_words(0) {} set unique_words(const string &str) { istringstream source(str); set words; string word; while (source >> word) { words.insert(word); } return words; } void process_data(string file_in) { //read in a single csv file, with each line representing a post, //reading in and store each line as a node in row_value; ; csvstream name(file_in); map row_value; while (name >> row_value) { num_post_label[row_value["tag"]]++; set split_words = unique_words(row_value["content"]); for (const auto &i : split_words) { num_words[i]++; num_post_label_with_word[row_value["tag"]][i]++; } num_posts += 1; } num_unique_words = static_cast(num_words.size()); } void process_print_norm(string file_in) { process_data(file_in); cout << "trained on " << num_posts << " examples" << endl; } void process_print_debug(string file_in) { process_data(file_in); cout << "training data:" << endl; map row_value; csvstream csvin(file_in); while (csvin >> row_value) { string label = row_value["tag"]; string content = row_value["content"]; string predicted = find_most_probable_label(content).first; cout << " label = " << label << ", content = " << content << endl; } cout << "trained on " << num_posts << " examples" << endl << "vocabulary size = " << num_unique_words << endl << endl; cout << "classes:" << endl; for(const auto &i : num_post_label) { cout << " " << i.first << ", " << i.second << " examples, log-prior = " << compute_log_labelc(i.first) << endl; } cout << "classifier parameters:" << endl; for(const auto &i : num_post_label_with_word) { for (const auto &j : i.second) { cout << " " << i.first << ":" << j.first << ", count = " << j.second << ", log-likelihood = " << compute_log_labelc_givenw(i.first, j.first) << endl; } } } double compute_log_labelc(string label) { //computes the log probability of finding a post of a given label //case 1: label doesnt exist if (num_post_label.find(label) == num_post_label.end()) { return log(1.0 / num_posts); } //case 2: label exists else { return log(num_post_label[label] / static_cast(num_posts)); } } double compute_log_labelc_givenw(string label, string word) { //computes the log probability of finding a post with a word given a label //case 1: word does not exist if (num_words.find(word) == num_words.end()) { return log(1.0 / num_posts); } //case 2: word exists but not in label else if (num_post_label_with_word[label].find(word) == num_post_label_with_word[label].end()) { return log(num_words[word] / static_cast(num_posts)); } //case 3: word exists and is within label else { return log(num_post_label_with_word[label][word] / static_cast(num_post_label[label])); } } double compute_prob(set post_content, string label) { //computes probability of a post having the given label double prob_post = compute_log_labelc(label); for (const auto &word : post_content) { prob_post += compute_log_labelc_givenw(label, word); } return prob_post; } pair find_most_probable_label(string post_in) { //finds the label of the highest proability for a given post set set_of_unique_words = unique_words(post_in); string label = num_post_label.begin()->first; double max_prob = compute_prob(set_of_unique_words, label); for (const auto &l : num_post_label) { double prob = compute_prob(set_of_unique_words, l.first); if (prob > max_prob) { label = l.first; max_prob = prob; } } pair pair (label, max_prob); return pair; } // double compute_lps(string post_in) { // // computes lps of predicted label based on post // set set_of_unique_words = unique_words(post_in); // string label = find_most_probable_label(post_in); // return compute_prob(set_of_unique_words, label); // } void predict(string file_in) { //predicts labels for each post prints results to cout int count_correct = 0, count_total = 0; cout << endl << "test data:" << endl; map row_value; csvstream csvin(file_in); while (csvin >> row_value) { string correct = row_value["tag"]; string content = row_value["content"]; string predicted = find_most_probable_label(content).first; // double lps = compute_lps(row_value["content"]); double lps = find_most_probable_label(content).second; cout << " correct = " << correct << ", predicted = " << predicted << ", log-probability score = " << lps << endl << " content = " << content << endl << endl; if (correct == predicted) {count_correct++;} count_total++; } cout << "performance: " << count_correct << " / " << count_total << " posts predicted correctly" << endl; } }; int main(int argc, char **argv) { cout.precision(3); string executable, train_file, test_file, debug = ""; if (!(argc == 3 || argc == 4)) { cout << "Usage: main.exe TRAIN_FILE TEST_FILE [--debug]" << endl; return -1; } else if (argc == 3) { executable = argv[0]; train_file = argv[1]; test_file = argv[2]; } else { executable = argv[0]; train_file = argv[1]; test_file = argv[2]; debug = argv[3]; if (debug != "--debug") { cout << "Usage: main.exe TRAIN_FILE TEST_FILE [--debug]" << endl; return -1; } } ifstream fin_train(train_file); ifstream fin_test(test_file); if (!fin_train.is_open()) { cout << "Error opening file: " << train_file << endl; return -1; } if (!fin_test.is_open()) { cout << "Error opening file: " << test_file << endl; return -1; } classifier test; if (debug == "--debug") {test.process_print_debug(train_file);} else {test.process_print_norm(train_file);} test.predict(test_file); return 0; }