#include <iostream> #include <fstream> #include <map> #include "csvstream.h" #include <string> #include <cmath> #include <set> #include <vector> using namespace std; class Classifier { private: double numposts = 0.0; int vocab = 0; string filename; string testname; bool debug = false; pair<string, string> a; map<string, double> wordmap; map<string, double> labelmap; map<pair<string,string>, double> detailmap; string maxlabel; map<string, double> logpriormap; map<pair<string,string>, double> detailmaplog; public: int input_management(int argc, char *argv[]) // Manage Input { if(argc == 3) { filename = argv[1]; testname = argv[2]; } else if(argc == 4) { if(strcmp(argv[3], "--debug") != 0) { cout << argv[3] << endl; cout << "Usage: main.exe TRAIN_FILE TEST_FILE [--debug]" << endl; return 1; } else { filename = argv[1]; testname = argv[2]; debug = true; } } else { cout << "Usage: main.exe TRAIN_FILE TEST_FILE [--debug]" << endl; return 1; } return 0; } // END OF INPUT MANAGEMENT int train_io() { csvstream csvin(filename); if(!csvin) { cout << "Error opening file: " << filename << endl; return 1; } int counter = 0; map<string, string> row; pair<string, string> temp; if(debug == true) cout << "training data:" << endl; while(csvin >> row) { for(auto &col : row) { string a = col.first; string b = col.second; if(col.first == "content") { if(debug == true) { cout << " label = " << row["tag"] << ", " << "content = " << col.second << endl; } counter++; set<string> temp2 = unique_words(col.second); numposts++; temp.first = row["tag"]; labelmap[row["tag"]]++; if(temp.first == "") { temp.first = row["tag"]; } for(auto it: temp2) { temp.second = it; wordmap[it]++; detailmap[temp]++; } } } } cout << "trained on " << counter << " examples" << endl; if(!debug) cout << endl; maxlabelfinder(); vocab = wordmap.size(); if(debug == true) { cout << "vocabulary size = " << vocab << endl; cout << endl; cout << "classes:" << endl; for(auto &lab: logpriormap) { cout << " " << lab.first << ", " << labelmap[lab.first] << " examples, " << "log-prior = " << lab.second << endl; } cout << "classifier parameters:" << endl; { double tempvar; for(auto det: detailmap) { tempvar = log(det.second/labelmap[(det.first).first]); cout << " " << (det.first).first <<":" << (det.first).second << ", count = " << det.second << ", log-likelihood = " << tempvar << endl; } } cout << endl; } return 0; } int classify_io() { csvstream csvin_test(testname); if(!csvin_test) { cout << "Error opening file: " << testname << endl; return 1; } map<string, string> row; string temp; string ans; set<string> wordset; pair<string, double> answer; int correct = 0, ourcorrect = 0; map<string, double> mapofwords; cout << "test data:" << endl; while(csvin_test >> row) { for(auto &col : row) { if(col.first == "content") { temp = col.second; wordset = unique_words(temp); answer = predict(wordset); } if(col.first == "tag") { if(col.second == answer.first) { correct++; ourcorrect++; } else { correct++; } cout << " correct = " << col.second; cout << ", predicted = " << answer.first; cout << ", log-probability score = " << answer.second; cout << endl << " content = " << temp << endl << endl; } } } cout << "performance: " << ourcorrect << " / " << correct << " posts predicted correctly" << endl; return 0; } vector<string> create_set(string str) // CREATES SET OF WORDS TO PREDICT ON { vector<string> ans; string temp; for(string::iterator it=str.begin(); it!=str.end(); ++it) { if(*it != ' ') { char a = *it; temp.push_back(a); } if(*it == ' ') { ans.push_back(temp); temp = ""; } } ans.push_back(temp); return ans; } set<string> unique_words(const string &str) { // Fancy modern C++ and STL way to do it istringstream source{str}; return {istream_iterator<string>{source}, istream_iterator<string>{}}; } void maxlabelfinder() // CALC MAXLABEL { double logprior; double max = -1000000; if(maxlabel == "") { for(auto lab: labelmap) { logprior = log(lab.second/numposts); logpriormap[lab.first] = logprior; if(logprior > max) { maxlabel = lab.first; } } } } pair<string, double> predict(set<string> wordset) // PREDICT { double maxlabelword; string tag; double maximumlog = -1000000; pair<string,string> temp; pair<string, double> returnans; for(auto lab: labelmap) { maxlabelword = logpriormap[lab.first]; for(auto words: wordset) { temp.first = lab.first; temp.second = words; if(detailmap.find(temp) != detailmap.end()) { maxlabelword += log((double)detailmap[temp]/lab.second); } else { if(wordmap.find(words) != wordmap.end()) { maxlabelword += log((double)wordmap[words]/numposts); } else { maxlabelword += log((double)1/numposts); } } } if(maximumlog < maxlabelword) { maximumlog = maxlabelword; tag = lab.first; } else if(maximumlog == maxlabelword) { if((lab.first).compare(tag) < 0) { tag = lab.first; } } } returnans.first = tag; returnans.second = maximumlog; return returnans; } }; int main(int argc, char *argv[]) { cout.precision(3); Classifier classA; int a = 0, b = 0; a = classA.input_management(argc , argv); if(a == 1) return 1; b = classA.train_io(); if(b == 1) return 1; classA.classify_io(); return 0; }