Post-classifier-using-Machine-Learning / csvstream.h
csvstream.h
Raw

#ifndef CSVSTREAM_H
#define CSVSTREAM_H

#include <iostream>
#include <fstream>
#include <sstream>
#include <cassert>
#include <string>
#include <vector>
#include <map>
#include <regex>
#include <exception>


// A custom exception type
class csvstream_exception : public std::exception {
public:
  const char * what () const noexcept override {
    return msg.c_str();
  }
  const std::string msg;
  csvstream_exception(const std::string &msg) : msg(msg) {};
};


// csvstream interface
class csvstream {
public:
  // Constructor from filename. Throws csvstream_exception if open fails.
  csvstream(const std::string &filename, char delimiter=',', bool strict=true);

  // Constructor from stream
  csvstream(std::istream &is, char delimiter=',', bool strict=true);

  // Destructor
  ~csvstream();

  // Return false if an error flag on underlying stream is set
  explicit operator bool() const;

  // Return header processed by constructor
  std::vector<std::string> getheader() const;

  // Stream extraction operator reads one row. Throws csvstream_exception if
  // the number of items in a row does not match the header.
  csvstream & operator>> (std::map<std::string, std::string>& row);

  // Stream extraction operator reads one row, keeping column order. Throws
  // csvstream_exception if the number of items in a row does not match the
  // header.
  csvstream & operator>> (std::vector<std::pair<std::string, std::string> >& row);

private:
  // Filename.  Used for error messages.
  std::string filename;

  // File stream in CSV format, used when library is called with filename ctor
  std::ifstream fin;

  // Stream in CSV format
  std::istream &is;

  // Delimiter between columns
  char delimiter;

  // Strictly enforce the number of values in each row.  Raise an exception if
  // a row contains too many values or too few compared to the header.  When
  // strict=false, ignore extra values and set missing values to empty string.
  bool strict;

  // Line no in file.  Used for error messages
  size_t line_no;

  // Store header column names
  std::vector<std::string> header;

  // Process header, the first line of the file
  void read_header();

  // Disable copying because copying streams is bad!
  csvstream(const csvstream &);
  csvstream & operator= (const csvstream &);
};


///////////////////////////////////////////////////////////////////////////////
// Implementation

// Read and tokenize one line from a stream
static bool read_csv_line(std::istream &is,
                          std::vector<std::string> &data,
                          char delimiter
                          ) {

  // Add entry for first token, start with empty string
  data.clear();
  data.push_back(std::string());

  // Process one character at a time
  char c = '\0';
  enum State {BEGIN, QUOTED, QUOTED_ESCAPED, UNQUOTED, UNQUOTED_ESCAPED, END};
  State state = BEGIN;
  while(is.get(c)) {
    switch (state) {
    case BEGIN:
      // We need this state transition to properly handle cases where nothing
      // is extracted.
      state = UNQUOTED;

      // Intended switch fallthrough.  Beginning with GCC7, this triggers an
      // error by default.  Disable the error for this specific line.
      #if __GNUG__ && __GNUC__ >= 7
      [[fallthrough]];
      #endif

    case UNQUOTED:
      if (c == '"') {
        // Change states when we see a double quote
        state = QUOTED;
      } else if (c == '\\') { //note this checks for a single backslash char
        state = UNQUOTED_ESCAPED;
        data.back() += c;
      } else if (c == delimiter) {
        // If you see a delimiter, then start a new field with an empty string
        data.push_back("");
      } else if (c == '\n' || c == '\r') {
        // If you see a line ending *and it's not within a quoted token*, stop
        // parsing the line.  Works for UNIX (\n) and OSX (\r) line endings.
        // Consumes the line ending character.
        state = END;
      } else {
        // Append character to current token
        data.back() += c;
      }
      break;

    case UNQUOTED_ESCAPED:
      // If a character is escaped, add it no matter what.
      data.back() += c;
      state = UNQUOTED;
      break;

    case QUOTED:
      if (c == '"') {
        // Change states when we see a double quote
        state = UNQUOTED;
      } else if (c == '\\') {
        state = QUOTED_ESCAPED;
        data.back() += c;
      } else {
        // Append character to current token
        data.back() += c;
      }
      break;

    case QUOTED_ESCAPED:
      // If a character is escaped, add it no matter what.
      data.back() += c;
      state = QUOTED;
      break;

    case END:
      if (c == '\n') {
        // Handle second character of a Windows line ending (\r\n).  Do
        // nothing, only consume the character.
      } else {
        // If this wasn't a Windows line ending, then put character back for
        // the next call to read_csv_line()
        is.unget();
      }

      // We're done with this line, so break out of both the switch and loop.
      goto multilevel_break; //This is a rare example where goto is OK
      break;

    default:
      assert(0);
      throw state;

    }//switch
  }//while

 multilevel_break:
  // Clear the failbit if we extracted anything.  This is to mimic the behavior
  // of getline(), which will set the eofbit, but *not* the failbit if a partial
  // line is read.
  if (state != BEGIN) is.clear();

  // Return status is the underlying stream's status
  return static_cast<bool>(is);
}


csvstream::csvstream(const std::string &filename, char delimiter, bool strict)
  : filename(filename),
    is(fin),
    delimiter(delimiter),
    strict(strict),
    line_no(0) {

  // Open file
  fin.open(filename.c_str());
  if (!fin.is_open()) {
    throw csvstream_exception("Error opening file: " + filename);
  }

  // Process header
  read_header();
}


csvstream::csvstream(std::istream &is, char delimiter, bool strict)
  : filename("[no filename]"),
    is(is),
    delimiter(delimiter),
    strict(strict),
    line_no(0) {
  read_header();
}


csvstream::~csvstream() {
  if (fin.is_open()) fin.close();
}


csvstream::operator bool() const {
  return static_cast<bool>(is);
}


std::vector<std::string> csvstream::getheader() const {
  return header;
}


csvstream & csvstream::operator>> (std::map<std::string, std::string>& row) {
  // Clear input row
  row.clear();

  // Read one line from stream, bail out if we're at the end
  std::vector<std::string> data;
  if (!read_csv_line(is, data, delimiter)) return *this;
  line_no += 1;

  // When strict mode is disabled, coerce the length of the data.  If data is
  // larger than header, discard extra values.  If data is smaller than header,
  // pad data with empty strings.
  if (!strict) {
    data.resize(header.size());
  }

  // Check length of data
  if (data.size() != header.size()) {
    auto msg = "Number of items in row does not match header. " +
      filename + ":L" + std::to_string(line_no) + " " +
      "header.size() = " + std::to_string(header.size()) + " " +
      "row.size() = " + std::to_string(data.size()) + " "
      ;
    throw csvstream_exception(msg);
  }

  // combine data and header into a row object
  for (size_t i=0; i<data.size(); ++i) {
    row[header[i]] = data[i];
  }

  return *this;
}


csvstream & csvstream::operator>> (std::vector<std::pair<std::string, std::string> >& row) {
  // Clear input row
  row.clear();
  row.resize(header.size());

  // Read one line from stream, bail out if we're at the end
  std::vector<std::string> data;
  if (!read_csv_line(is, data, delimiter)) return *this;
  line_no += 1;

  // When strict mode is disabled, coerce the length of the data.  If data is
  // larger than header, discard extra values.  If data is smaller than header,
  // pad data with empty strings.
  if (!strict) {
    data.resize(header.size());
  }

  // Check length of data
  if (row.size() != header.size()) {
    auto msg = "Number of items in row does not match header. " +
      filename + ":L" + std::to_string(line_no) + " " +
      "header.size() = " + std::to_string(header.size()) + " " +
      "row.size() = " + std::to_string(row.size()) + " "
      ;
    throw csvstream_exception(msg);
  }

  // combine data and header into a row object
  for (size_t i=0; i<data.size(); ++i) {
    row[i] = make_pair(header[i], data[i]);
  }

  return *this;
}


void csvstream::read_header() {
  // read first line, which is the header
  if (!read_csv_line(is, header, delimiter)) {
    throw csvstream_exception("error reading header");
  }
}

#endif