OpenDataPhillyFinal / src / edu / upenn / cit594 / datamanagement / CSVCovidReader.java
CSVCovidReader.java
Raw
package edu.upenn.cit594.datamanagement;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.upenn.cit594.ui.UserInterface;
import edu.upenn.cit594.util.CovidData;

public class CSVCovidReader implements CovidReader {

	protected static String filename;
	
	public CSVCovidReader (String name) {
		filename = name;
	}
	
	/**
	 * Reads covid_data.csv file and stores the values in the CovidData Objects
	 * @param filename: covid data.csv file
	 * @return an ArrayList of CovidData objects
	 */
	public static ArrayList<CovidData> readCovidCSV (){
		File csvFile = new File(filename);
		if(!csvFile.exists()) {
			UserInterface.print("error: covid csv file does not exist.");
			return null;
		} else if (!csvFile.canRead()) {
			UserInterface.print("error: covid csv file can not be read.");
			return null;
		}

		//To store the CovidData Object Entries
		ArrayList<CovidData> CovidData = new ArrayList<CovidData>();
		//Stores BufferedReader line
		String line = "";
		
		try {
			try (BufferedReader br = new BufferedReader(new FileReader(csvFile))) {
				
				//Initialize column indexes
				int zipCodeIndex = -1;
				int partialVaxIndex = -1;
				int fullVaxIndex= -1;
				int boostedIndex = -1;
				int timeStampIndex = -1;
				
				//Reads the first line to get header row out of the way
				@SuppressWarnings("unused")
				String headerLine = br.readLine();
				
				//Loop through the rest of csv file
				while((line = br.readLine()) != null) {
					
					//Splits by "," but ignores any commas surrounded by double quotes
					String[] colValues = line.split(",(?=(?:[^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*$)");
					
					//Set any null or empty string values to 0
					for (int i=0; i<colValues.length; i++) {
						if(colValues[i] == null || colValues[i].equals("")) {
							colValues[i] = "0";
						}
					}
					
					//Get Key:Val pair of columns Name:Index
					HashMap<String, Integer> headerColsMap = new HashMap<String, Integer>();
					headerColsMap = readHeader(filename);
					
					//Check column name exists for Object Fields
					boolean zipCodeExists = headerColsMap.containsKey("\"zip_code\"");
					boolean partialVaxExists = headerColsMap.containsKey("\"partially_vaccinated\"");
					boolean fullVaxExists = headerColsMap.containsKey("\"fully_vaccinated\"");
					boolean boostedExists = headerColsMap.containsKey("\"boosted\"");
					boolean timeStampExists = headerColsMap.containsKey("\"etl_timestamp\"");
					
					//Set column indexes
					//if any of the columns are missing from the csv file, display a warning message
					if (zipCodeExists) {
						zipCodeIndex = headerColsMap.get("\"zip_code\"");
					}else {
						UserInterface.print("Warning: csv file is missing a zip_code column.");
					}
					if (partialVaxExists) {
						partialVaxIndex = headerColsMap.get("\"partially_vaccinated\"");
					}else {
						UserInterface.print("Warning: csv file is missing a partially_vaccinated column.");
					}
					if (fullVaxExists) {
						fullVaxIndex = headerColsMap.get("\"fully_vaccinated\"");
					}else {
						UserInterface.print("Warning: csv file is missing a fully_vaccinated column.");
					}
					if (boostedExists) {
						boostedIndex = headerColsMap.get("\"boosted\"");
					}else {
						UserInterface.print("Warning: csv file is missing a boosted column.");
					}
					if (timeStampExists) {
						timeStampIndex = headerColsMap.get("\"etl_timestamp\"");
					}else {
						UserInterface.print("Warning: csv file is missing an etl_timestamp column.");
					}
					
					
					//If zipcode length is not equal to 5, then skip to next covid data row
					if(colValues[zipCodeIndex].strip().length() != 5) { continue; }
					
					//If timestamp is not in specified format (“YYYY- MM-DD hh:mm:ss”), then skip row
					String pattern = "[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1]) (2[0-3]|[01][0-9]):[0-5][0-9]:[0-5][0-9]";
					Pattern r = Pattern.compile(pattern);
					Matcher m = r.matcher(colValues[timeStampIndex]);
					if(colValues[timeStampIndex].equals("0") || !m.find()) { continue; }
					
					
					//If any of the indexes are still -1: means that the column name did not exist. Set the field's value to 0
					if (zipCodeIndex == -1) { colValues[zipCodeIndex] = "0"; }
					if (partialVaxIndex == -1) { colValues[partialVaxIndex] = "0"; }
					if (fullVaxIndex == -1) { colValues[fullVaxIndex] = "0"; }
					if (boostedIndex == -1) { colValues[boostedIndex] = "0"; }
					if (timeStampIndex == -1) { colValues[timeStampIndex] = "0"; }
					
					
					//Add a new CovidData set
					CovidData.add(new CovidData(
						Integer.parseInt(colValues[zipCodeIndex]), // set zip code
						Integer.parseInt(colValues[partialVaxIndex]), // set partialVax
						Integer.parseInt(colValues[fullVaxIndex]), // set fullVax
						Integer.parseInt(colValues[boostedIndex]), // set boosted count
						(colValues[timeStampIndex])					// set time stamp
						));
				
				}
				br.close();
				//return CovidData Object
				return CovidData;
			} catch (NumberFormatException e) {
				e.printStackTrace();
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		//If we don't make it into the try, then returns empty ArrayList
		return CovidData;			
	}

	/**
	 * Get the column index of every column's name
	 * @param filename: the csv file
	 * @return a HashMap where the key is the column's name and the value is it's index 
	 */
	public static HashMap<String, Integer> readHeader (String filename){
		
		File csvFile = new File(filename);
		if(!csvFile.exists()) {
			UserInterface.print("error: csv file does not exist.");
			return null;
		} else if (!csvFile.canRead()) {
			UserInterface.print("error: csv file can not be read.");
			return null;
		}
	
		//Keys: column's name 
		//Values: column's index
		HashMap<String, Integer> headerColsMap = new HashMap<String, Integer>();
		
		try {
			try (BufferedReader br = new BufferedReader(new FileReader(csvFile))) {
				
				String headerLiner = br.readLine();
				//Splits by "," but ignores any commas surrounded by double quotes
				String[] headerArray = headerLiner.split(",(?=(?:[^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*$)");
				
				//Stores index of column name
				int i = 0;
				
				for(String col : headerArray) {
					headerColsMap.put(col, i);
					i++;
				}
				br.close();
			}			
			return headerColsMap;
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return headerColsMap;
	}
	
	
}