// Repository path: CSE-8B / PA7 / starter / RatingPredictor.java (raw view)
  //File: RatingPredictor.java
//Name: Trai Pham
//Date: 02/16/2020
//Course: CSE 8B
/**
This class will allow us to clean text files and predict ratings based on
learned data.
*/

import java.util.Arrays;
import java.util.HashSet;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Scanner;
import java.io.FileWriter;
import java.io.File;
import java.io.PrintWriter;
import java.io.IOException;
import java.io.FileNotFoundException;

/**
 * Cleans raw review text files and predicts review ratings from word
 * statistics learned on already-rated reviews.
 *
 * Typical use: {@link #createStopWordsSet} to load stop words,
 * {@link #cleanData} to normalise raw files, {@link #updateHashMap} to learn
 * per-word rating averages, then {@link #rateReviews} to predict ratings.
 *
 * Contract shared by all list-cleaning helpers: a null or empty input yields
 * null (callers in this class check for it); otherwise a NEW list is returned
 * and the argument is left untouched.
 */
public class RatingPredictor {
  /** Rating assumed for a word never seen in training, and for a review with no words. */
  private static final double UNKNOWN_WORD_RATING = 2.0;

  /**
   * Maps a word to a two-element array:
   * [0] = sum of the ratings of the reviews the word appeared in,
   * [1] = number of times the word was counted.
   */
  private HashMap<String, int[]> wordFreqMap;

  /** Every stop word loaded so far (from any stop-word file). */
  private HashSet<String> stopWords;

  /** The value of stopWordsFile that has already been (attempted to be) loaded. */
  private String loadedStopWordsFile;

  // NOTE(review): machine-specific absolute path kept for backward
  // compatibility. It is now read at most once per distinct value, and a
  // missing file no longer wipes out the words being cleaned.
  String stopWordsFile =
    "C:/Users/traip/Desktop/CSE8B PA7/starter/stopWords.txt";

  String whiteSpace = " ";
  String emptySpace = "";
  String allPunctuations = "!\"#$%&'()*+,-./:;<=>?@\\[\\]\\\\^_`{|}~";

  /** Creates a predictor with no stop words and no learned word statistics. */
  public RatingPredictor() {
    this.stopWords = new HashSet<String>();
    this.wordFreqMap = new HashMap<String, int[]>();
  }

  /**
   * Splits one review into space-separated tokens.
   *
   * @param sentence the full text of one review
   * @return the tokens in order, or null when sentence is null or empty
   */
  public ArrayList<String> splitLine(String sentence) {
    if (sentence == null || sentence.length() == 0) {
      return null;
    }
    return new ArrayList<String>(Arrays.asList(sentence.split(whiteSpace)));
  }

  /**
   * Breaks every word apart at hyphens and single quotes, so "Jungle-Book"
   * becomes "Jungle", "Book" and "It's" becomes "It", "s".
   *
   * @param words the words to split
   * @return a new list of the pieces, or null when words is null or empty
   */
  public ArrayList<String> splitAtHyphensAndQuotes(ArrayList<String> words) {
    if (words == null || words.size() == 0) {
      return null;
    }
    return replaceAndSplit(words, "[-']");
  }

  /**
   * Replaces every punctuation character listed in allPunctuations with a
   * space and splits the result, so "movie!" becomes "movie" and "a.b" becomes
   * "a", "b".
   *
   * @param words the words to strip
   * @return a new list of the pieces, or null when words is null or empty
   */
  public ArrayList<String> removePunctuation(ArrayList<String> words) {
    if (words == null || words.size() == 0) {
      return null;
    }
    // Same character class the method previously spelled out inline.
    return replaceAndSplit(words, "[" + allPunctuations + "]");
  }

  /**
   * Deletes every space character inside each word.
   *
   * @param words the words to clean
   * @return a new list of the same length, or null when words is null or empty
   */
  public ArrayList<String> removeWhiteSpaces(ArrayList<String> words) {
    if (words == null || words.size() == 0) {
      return null;
    }
    ArrayList<String> cleanWords = new ArrayList<String>(words.size());
    for (String word : words) {
      cleanWords.add(word.replaceAll(whiteSpace, emptySpace));
    }
    return cleanWords;
  }

  /**
   * Drops every empty string.
   *
   * @param words the words to filter
   * @return a new list without empty strings (possibly empty), or null when
   *         words is null or empty
   */
  public ArrayList<String> removeEmptyWords(ArrayList<String> words) {
    if (words == null || words.size() == 0) {
      return null;
    }
    ArrayList<String> cleanWords = new ArrayList<String>(words.size());
    for (String word : words) {
      if (!word.equals(emptySpace)) {
        cleanWords.add(word);
      }
    }
    return cleanWords;
  }

  /**
   * Drops every word that is exactly one character long.
   *
   * @param words the words to filter
   * @return a new list without one-character words (possibly empty), or null
   *         when words is null or empty
   */
  public ArrayList<String> removeSingleLetterWords(ArrayList<String> words) {
    if (words == null || words.size() == 0) {
      return null;
    }
    ArrayList<String> cleanWords = new ArrayList<String>(words.size());
    for (String word : words) {
      if (word.length() != 1) {
        cleanWords.add(word);
      }
    }
    return cleanWords;
  }

  /**
   * Lower-cases every word.
   *
   * @param words the words to convert
   * @return a new list of lower-cased words, or null when words is null or
   *         empty
   */
  public ArrayList<String> toLowerCase(ArrayList<String> words) {
    if (words == null || words.size() == 0) {
      return null;
    }
    ArrayList<String> cleanWords = new ArrayList<String>(words.size());
    for (String word : words) {
      cleanWords.add(word.toLowerCase());
    }
    return cleanWords;
  }

  /**
   * Drops every word that is a known stop word. The stop words are those
   * loaded by {@link #createStopWordsSet} plus those in stopWordsFile, which
   * is read at most once per distinct path. If stopWordsFile cannot be found
   * a warning goes to standard error and filtering continues with the stop
   * words already loaded.
   *
   * @param arrList the (lower-cased) words to filter
   * @return a new list without stop words (possibly empty), or null when
   *         arrList is null or empty
   */
  public ArrayList<String> removeStopWords(ArrayList<String> arrList) {
    if (arrList == null || arrList.size() == 0) {
      return null;
    }
    ensureDefaultStopWordsLoaded();
    ArrayList<String> cleanWords = new ArrayList<String>(arrList.size());
    for (String word : arrList) {
      if (!this.stopWords.contains(word)) {
        cleanWords.add(word);
      }
    }
    return cleanWords;
  }

  /**
   * Loads the stop words from inFile and writes every distinct stop word known
   * so far to outFile, one per line. Nothing is written when inFile cannot be
   * found; either failure is reported on standard error.
   *
   * @param inFile  path of the stop-word list to read (one word per line)
   * @param outFile path of the de-duplicated list to write
   */
  public void createStopWordsSet(String inFile, String outFile) {
    try {
      loadStopWords(inFile);
    } catch (FileNotFoundException ex) {
      System.err.println("RatingPredictor: cannot read stop words from "
        + inFile + ": " + ex.getMessage());
      return;
    }
    try (PrintWriter printer = new PrintWriter(new File(outFile))) {
      for (String stopWord : this.stopWords) {
        printer.println(stopWord);
      }
    } catch (FileNotFoundException ex) {
      System.err.println("RatingPredictor: cannot write stop words to "
        + outFile + ": " + ex.getMessage());
    }
  }

  /**
   * Cleans every line of inFile with the full pipeline (split, break at
   * hyphens/quotes, strip punctuation and spaces, drop empty and
   * one-character words, lower-case, drop stop words) and writes the result
   * to outFile. Each output line is the surviving words, each followed by one
   * space, terminated by "\n". With ratingIncluded the first character of the
   * raw line (the rating) is written first, also followed by a space. A blank
   * input line produces a blank output line so line numbers stay aligned.
   *
   * @param inFile         path of the raw review file
   * @param outFile        path of the cleaned file to write
   * @param ratingIncluded true when every line starts with a one-character
   *                       rating
   */
  public void cleanData(String inFile, String outFile, boolean ratingIncluded) {
    // Scanner first: a missing input file must not create or truncate outFile.
    try (Scanner read = new Scanner(new File(inFile));
         PrintWriter printer = new PrintWriter(new File(outFile))) {
      while (read.hasNextLine()) {
        String rawLine = read.nextLine();
        StringBuilder cleanLine = new StringBuilder();
        // Guard the substring: a blank line has no rating character to copy.
        if (ratingIncluded && rawLine.length() > 0) {
          cleanLine.append(rawLine.substring(0, 1)).append(whiteSpace);
        }
        ArrayList<String> cleanWords = cleanLineWords(rawLine);
        if (cleanWords != null) {
          for (String word : cleanWords) {
            cleanLine.append(word).append(whiteSpace);
          }
        }
        cleanLine.append("\n");
        printer.print(cleanLine);
      }
    } catch (FileNotFoundException ex) {
      System.err.println("RatingPredictor: cannot clean " + inFile + " into "
        + outFile + ": " + ex.getMessage());
    }
  }

  /**
   * Learns word statistics from a cleaned, rated file. Each line is expected
   * to be an integer rating followed by space-separated words; every word gets
   * the rating added to its sum and its count incremented. Blank lines are
   * skipped, and a line whose first token is not an integer is skipped with a
   * warning on standard error.
   *
   * @param inCleanFile path of the cleaned file that includes ratings
   */
  public void updateHashMap(String inCleanFile) {
    try (Scanner read = new Scanner(new File(inCleanFile))) {
      while (read.hasNextLine()) {
        ArrayList<String> tokens = splitLine(read.nextLine());
        if (tokens == null || tokens.isEmpty()) {
          continue;
        }
        int rating;
        try {
          // Parsed once per line rather than once per word.
          rating = Integer.parseInt(tokens.get(0));
        } catch (NumberFormatException ex) {
          System.err.println("RatingPredictor: skipping line without a numeric"
            + " rating in " + inCleanFile + ": " + tokens.get(0));
          continue;
        }
        for (int i = 1; i < tokens.size(); i++) {
          String word = tokens.get(i);
          int[] totals = this.wordFreqMap.get(word);
          if (totals == null) {
            totals = new int[2];
            this.wordFreqMap.put(word, totals);
          }
          totals[0] += rating;
          totals[1] += 1;
        }
      }
    } catch (FileNotFoundException ex) {
      System.err.println("RatingPredictor: cannot read " + inCleanFile + ": "
        + ex.getMessage());
    }
  }

  /**
   * Predicts a rating for every line of a cleaned, unrated file and writes one
   * rating per line (one decimal place, no newline after the last one). A
   * review's rating is the mean of its words' learned average ratings; a word
   * never seen in training counts as 2.0, and a review with no words at all is
   * rated 2.0.
   *
   * @param inCleanFile    path of the cleaned review file without ratings
   * @param outRatingsFile path of the ratings file to write
   */
  public void rateReviews(String inCleanFile, String outRatingsFile) {
    try (Scanner read = new Scanner(new File(inCleanFile));
         PrintWriter printer = new PrintWriter(new File(outRatingsFile))) {
      while (read.hasNextLine()) {
        ArrayList<String> words = splitLine(read.nextLine());
        double total = 0.0;
        int count = 0;
        if (words != null) {
          for (String word : words) {
            int[] totals = this.wordFreqMap.get(word);
            total += (totals != null)
              ? ((double) totals[0]) / totals[1]
              : UNKNOWN_WORD_RATING;
            count++;
          }
        }
        // Avoid 0/0 = NaN for a review with no words.
        double rating = (count == 0) ? UNKNOWN_WORD_RATING : total / count;
        String formatted = String.format("%.1f", rating);
        if (read.hasNextLine()) {
          printer.println(formatted);
        } else {
          printer.print(formatted);
        }
      }
    } catch (FileNotFoundException ex) {
      System.err.println("RatingPredictor: cannot rate " + inCleanFile
        + " into " + outRatingsFile + ": " + ex.getMessage());
    }
  }

  /** Replaces every match of regex in each word with a space, then splits on spaces. */
  private ArrayList<String> replaceAndSplit(ArrayList<String> words,
      String regex) {
    ArrayList<String> pieces = new ArrayList<String>();
    for (String word : words) {
      String spaced = word.replaceAll(regex, whiteSpace);
      pieces.addAll(Arrays.asList(spaced.split(whiteSpace)));
    }
    return pieces;
  }

  /** Runs the whole cleaning pipeline on one raw line; null when nothing is left to clean. */
  private ArrayList<String> cleanLineWords(String rawLine) {
    ArrayList<String> words = splitLine(rawLine);
    words = splitAtHyphensAndQuotes(words);
    words = removePunctuation(words);
    words = removeWhiteSpaces(words);
    words = removeEmptyWords(words);
    words = removeSingleLetterWords(words);
    words = toLowerCase(words);
    return removeStopWords(words);
  }

  /** Reads stopWordsFile into the stop-word set the first time each distinct path is seen. */
  private void ensureDefaultStopWordsLoaded() {
    if (stopWordsFile == null || stopWordsFile.equals(loadedStopWordsFile)) {
      return;
    }
    // Remember the attempt even if it fails, so a missing file is reported
    // once instead of once per cleaned line.
    loadedStopWordsFile = stopWordsFile;
    try {
      loadStopWords(stopWordsFile);
    } catch (FileNotFoundException ex) {
      System.err.println("RatingPredictor: stop-word file not found: "
        + stopWordsFile + " (continuing with " + this.stopWords.size()
        + " stop words already loaded)");
    }
  }

  /** Adds every non-blank, trimmed line of the file at path to the stop-word set. */
  private void loadStopWords(String path) throws FileNotFoundException {
    try (Scanner read = new Scanner(new File(path))) {
      while (read.hasNextLine()) {
        String stopWord = read.nextLine().trim();
        if (!stopWord.isEmpty()) {
          this.stopWords.add(stopWord);
        }
      }
    }
  }

  /**
   * Manual smoke test: exercises every cleaning step on sample data, then runs
   * the file-based pipeline on the course data files in the working directory.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    RatingPredictor obj = new RatingPredictor();

    System.out.println("testing SplitLine");
    String testReview = "The Jungle-Book is a fantastic movie! It's the best!!";
    System.out.println(obj.splitLine(testReview));

    System.out.println("testing splitAtHyphensAndQuotes");
    ArrayList<String> rawHyphenAndQuotes = new ArrayList<String>(
      Arrays.asList("The", "Jungle-Book", "is", "a", "fantastic", "movie!",
        "It's", "the", "best!!"));
    System.out.println(obj.splitAtHyphensAndQuotes(rawHyphenAndQuotes));

    System.out.println("testing removePunctuation");
    ArrayList<String> rawRemovePunctuation = new ArrayList<String>(
      Arrays.asList("The", "Jungle", "Book", "is", "a", "fantastic", "movie!",
        "It", "s", "the", "best!!"));
    System.out.println(obj.removePunctuation(rawRemovePunctuation));

    System.out.println("testing removeWhiteSpaces");
    ArrayList<String> rawRemoveWhiteSpaces = new ArrayList<String>(
      Arrays.asList("The", "Jungle", "Book ", "is", "a", " fantastic", "movie",
        " It", "s", "the ", "best"));
    System.out.print("Raw:");
    System.out.println(rawRemoveWhiteSpaces);
    System.out.println(obj.removeWhiteSpaces(rawRemoveWhiteSpaces));

    System.out.println("testing removeEmptyWords");
    ArrayList<String> rawRemoveEmptyWords = new ArrayList<String>(
      Arrays.asList("The", "Jungle", "Book", "", "", "fantastic", "movie", "It",
        "s", "", "best"));
    System.out.println("Raw:" + rawRemoveEmptyWords);
    System.out.println(obj.removeEmptyWords(rawRemoveEmptyWords));

    System.out.println("testing removeSingleLetterWords");
    ArrayList<String> rawRemoveSingleLetterWords = new ArrayList<String>(
      Arrays.asList("The", "Jungle", "Book", "is", "a", "fantastic", "movie",
        "It", "s", "the", "best"));
    System.out.println(obj.removeSingleLetterWords(rawRemoveSingleLetterWords));

    System.out.println("testing toLowerCase");
    ArrayList<String> rawToLowerCase = new ArrayList<String>(
      Arrays.asList("The", "Jungle", "Book", "is", "fantastic", "movie", "It",
        "the", "best"));
    System.out.println(obj.toLowerCase(rawToLowerCase));

    System.out.println("testing removeStopWords");
    ArrayList<String> rawRemoveStopWords = new ArrayList<String>(
      Arrays.asList("the", "jungle", "book", "is", "fantastic", "movie", "it",
        "the", "best"));
    System.out.println(obj.removeStopWords(rawRemoveStopWords));

    System.out.println("testing ceateStopWordsSet");
    obj.createStopWordsSet("stopwords.txt", "uniqueStopwords.txt");

    System.out.println("testing cleanData");
    obj.cleanData("rawReviewRatings.txt", "cleanReviewRatings.txt", true);
    obj.cleanData("rawReviewRatingsBig.txt", "cleanRawReviewRatingsBig.txt",
      true);
    obj.cleanData("rawReviews.txt", "cleanRawReviews.txt", false);
    obj.cleanData("rawReviewsBig.txt", "cleanRawReviewsBig.txt", false);

    System.out.println("testing updateHashMap");
    obj.updateHashMap("cleanReviewRatings.txt");

    System.out.println("testing rateReviews");
    obj.rateReviews("cleanRawReviews.txt", "ratings.txt");
  }
}