Piazza-Classifier / main.cpp
main.cpp
Raw
    #include <cmath>
    #include <cstring>
    #include <fstream>
    #include <iostream>
    #include <iterator>
    #include <limits>
    #include <map>
    #include <set>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    #include "csvstream.h"

    using namespace std;


    /**
     * Bag-of-words naive Bayes classifier for labelled text posts.
     *
     * Intended call order: input_management() to parse the command line,
     * train_io() to learn counts from the training CSV, then classify_io() to
     * label every row of the test CSV and report accuracy.  Both files are read
     * through csvstream and are looked up by the column names "tag" (the label)
     * and "content" (the whitespace-separated text).
     *
     * All counts are stored as double so that the ratios fed to log() divide in
     * floating point without casts.
     */
    class Classifier
    {
        private:
            double numposts  = 0.0;                      // training posts seen so far
            int vocab = 0;                               // number of distinct training words
            string filename;                             // training CSV path
            string testname;                             // test CSV path
            bool debug = false;                          // true when "--debug" was given
            map<string, double> wordmap;                 // word -> posts containing it
            map<string, double> labelmap;                // label -> posts carrying it
            map<pair<string,string>, double> detailmap;  // (label, word) -> posts with label containing word
            string maxlabel;                             // label with the largest log-prior
            map<string, double> logpriormap;             // label -> log(labelmap[label] / numposts)

            // Prints the --debug report that follows training: vocabulary size,
            // per-class counts with log-priors, and per-(label, word) counts with
            // log-likelihoods.
            void print_training_summary() const
            {
                cout << "vocabulary size = " << vocab << '\n';
                cout << '\n';
                cout << "classes:" << '\n';
                for(const auto &prior : logpriormap)
                {
                    // Every logpriormap key was copied from labelmap, so at() cannot throw.
                    cout << "  " << prior.first << ", " << labelmap.at(prior.first)
                         << " examples, " << "log-prior = " << prior.second << '\n';
                }
                cout << "classifier parameters:" << '\n';
                for(const auto &entry : detailmap)
                {
                    const string &label = entry.first.first;
                    const string &word = entry.first.second;
                    // detailmap entries are only created for labels already counted in labelmap.
                    const double loglikelihood = log(entry.second / labelmap.at(label));
                    cout << "  " << label << ":" << word
                         << ", count = " << entry.second
                         << ", log-likelihood = " << loglikelihood << '\n';
                }
                cout << endl;
            }

        public:
            /**
             * Parses the command line: TRAIN_FILE TEST_FILE [--debug].
             *
             * @param argc argument count as passed to main
             * @param argv argument vector as passed to main
             * @return 0 on success; 1 after printing the usage message when the
             *         argument count is wrong or the optional flag is not "--debug".
             */
            int input_management(int argc, char *argv[])
            {
                if(argc != 3 && argc != 4)
                {
                    cout << "Usage: main.exe TRAIN_FILE TEST_FILE [--debug]" << endl;
                    return 1;
                }
                if(argc == 4)
                {
                    if(strcmp(argv[3], "--debug") != 0)
                    {
                        // Echo the unrecognised argument before the usage text.
                        cout << argv[3] << endl;
                        cout << "Usage: main.exe TRAIN_FILE TEST_FILE [--debug]" << endl;
                        return 1;
                    }
                    debug = true;
                }
                filename = argv[1];
                testname = argv[2];
                return 0;
            }

            /**
             * Reads the training CSV and accumulates the post, label, word and
             * (label, word) counts, then computes the log-priors.
             *
             * Each word is counted at most once per post (unique_words() returns a
             * set).  Rows without a "content" column are ignored; a row without a
             * "tag" column is counted under the empty label.
             *
             * Always prints "trained on N examples"; with --debug it also echoes
             * every training row and prints the learned parameters.
             *
             * @return 0 on success, 1 if the training file could not be opened.
             */
            int train_io()
            {
                csvstream csvin(filename);
                if(!csvin)
                {
                    cout << "Error opening file: " << filename << endl;
                    return 1;
                }

                int examples = 0;
                map<string, string> row;
                pair<string, string> key;   // (label, word), reused to avoid reallocating per word
                if(debug)
                    cout << "training data:" << '\n';

                while(csvin >> row)
                {
                    const auto content_it = row.find("content");
                    if(content_it == row.end())
                        continue;

                    // operator[] yields "" when there is no tag column; inserting into a
                    // std::map does not invalidate content_it.
                    const string label = row["tag"];
                    if(debug)
                    {
                        cout << "  label = " << label << ", "
                             << "content = " << content_it->second << '\n';
                    }

                    ++examples;
                    ++numposts;
                    ++labelmap[label];

                    key.first = label;
                    for(const string &word : unique_words(content_it->second))
                    {
                        key.second = word;
                        ++wordmap[word];
                        ++detailmap[key];
                    }
                }

                cout << "trained on " << examples << " examples" << endl;
                if(!debug)
                    cout << endl;

                maxlabelfinder();
                vocab = static_cast<int>(wordmap.size());
                if(debug)
                    print_training_summary();
                return 0;
            }

            /**
             * Classifies every row of the test CSV and prints, per row, the true
             * label, the predicted label, its log-probability score and the
             * content, followed by an overall "performance" line.
             *
             * Columns are looked up by name, so the result does not depend on the
             * order in which the row map happens to iterate.  Rows without a "tag"
             * column cannot be scored and are skipped; a missing "content" column
             * is treated as empty text rather than reusing the previous row's
             * prediction.
             *
             * @return 0 on success, 1 if the test file could not be opened.
             */
            int classify_io()
            {
                csvstream csvin_test(testname);
                if(!csvin_test)
                {
                    cout << "Error opening file: " << testname << endl;
                    return 1;
                }

                map<string, string> row;
                int total = 0;
                int predicted_correctly = 0;
                cout << "test data:" << '\n';

                while(csvin_test >> row)
                {
                    const auto tag_it = row.find("tag");
                    if(tag_it == row.end())
                        continue;

                    const auto content_it = row.find("content");
                    const string content =
                        (content_it != row.end()) ? content_it->second : string();
                    const pair<string, double> prediction = predict(unique_words(content));

                    ++total;
                    if(tag_it->second == prediction.first)
                        ++predicted_correctly;

                    cout << "  correct = " << tag_it->second
                         << ", predicted = " << prediction.first
                         << ", log-probability score = " << prediction.second << '\n'
                         << "  content = " << content << '\n' << '\n';
                }

                cout << "performance: " << predicted_correctly << " / "
                     << total << " posts predicted correctly" << endl;
                return 0;
            }

            /**
             * Splits str on single space characters, preserving order and
             * duplicates.
             *
             * Consecutive, leading or trailing spaces produce empty strings, and
             * the result always has at least one element (a single "" for empty
             * input).  Not used by the classifier itself, which relies on
             * unique_words().
             */
            static vector<string> create_set(const string &str)
            {
                vector<string> tokens;
                string current;
                for(const char ch : str)
                {
                    if(ch == ' ')
                    {
                        tokens.push_back(current);
                        current.clear();
                    }
                    else
                    {
                        current.push_back(ch);
                    }
                }
                tokens.push_back(current);
                return tokens;
            }

            /**
             * Returns the set of distinct whitespace-delimited words in str.
             */
            static set<string> unique_words(const string &str)
            {
                istringstream source{str};
                return {istream_iterator<string>{source},
                        istream_iterator<string>{}};
            }

            /**
             * Recomputes the log-prior of every label from the current counts and
             * records the label with the largest prior in maxlabel.
             *
             * Priors are recomputed on every call so they always match labelmap
             * and numposts.  Ties keep the alphabetically first label, because
             * labelmap iterates in key order and the comparison is strict.
             */
            void maxlabelfinder()
            {
                double best = -numeric_limits<double>::infinity();
                for(const auto &entry : labelmap)
                {
                    const double logprior = log(entry.second / numposts);
                    logpriormap[entry.first] = logprior;
                    if(logprior > best)
                    {
                        best = logprior;
                        maxlabel = entry.first;
                    }
                }
            }

            /**
             * Scores every known label against a post's distinct words and returns
             * the best one.
             *
             * A label's score is its log-prior plus, for each word:
             *   - log(count(label, word) / count(label)) if the word was seen with
             *     that label,
             *   - else log(count(word) / numposts) if the word was seen at all,
             *   - else log(1 / numposts).
             *
             * @param wordset distinct words of the post to classify
             * @return (label, score) of the highest-scoring label.  Ties go to the
             *         alphabetically first label.  If no label has been trained the
             *         result is ("", -infinity).
             */
            pair<string, double> predict(const set<string> &wordset) const
            {
                // Start from -infinity so that any real score, however low, wins.
                pair<string, double> best{string(), -numeric_limits<double>::infinity()};
                pair<string, string> key;   // (label, word) lookup key

                for(const auto &entry : labelmap)
                {
                    const string &label = entry.first;
                    const double label_count = entry.second;

                    // Use the stored prior when available; otherwise derive it from
                    // the same counts it would have been computed from.
                    const auto prior_it = logpriormap.find(label);
                    double score = (prior_it != logpriormap.end())
                                       ? prior_it->second
                                       : log(label_count / numposts);

                    key.first = label;
                    for(const string &word : wordset)
                    {
                        key.second = word;
                        const auto pair_it = detailmap.find(key);
                        if(pair_it != detailmap.end())
                        {
                            score += log(pair_it->second / label_count);
                            continue;
                        }
                        const auto word_it = wordmap.find(word);
                        if(word_it != wordmap.end())
                            score += log(word_it->second / numposts);
                        else
                            score += log(1.0 / numposts);
                    }

                    // labelmap iterates in ascending key order, so a strict '>'
                    // already resolves ties in favour of the alphabetically first label.
                    if(score > best.second)
                    {
                        best.first = label;
                        best.second = score;
                    }
                }
                return best;
            }
    };




    int main(int argc, char *argv[]) {
        cout.precision(3);
        Classifier classA;
        int a = 0, b = 0;
        a = classA.input_management(argc , argv);
        if(a == 1)
            return 1;
        b = classA.train_io();
        if(b == 1)
            return 1;
        classA.classify_io();
        return 0;
    }