//File: RatingPredictor.java
//Name: Trai Pham
//Date: 02/16/2020
//Course: CSE 8B

import java.util.Arrays;
import java.util.HashSet;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.Scanner;
import java.io.FileWriter;
import java.io.File;
import java.io.PrintWriter;
import java.io.IOException;
import java.io.FileNotFoundException;

/**
 * Cleans review text files and predicts ratings based on learned data.
 *
 * <p>The cleaning methods share one contract: a {@code null} or empty input
 * yields {@code null}; any other input yields a NEW list and leaves the
 * argument untouched.
 */
public class RatingPredictor {

    /** Rating contributed by a word that was never seen during training. */
    private static final double UNKNOWN_WORD_RATING = 2.0;

    /**
     * Maps a word to a two-element array: element 0 is the sum of the ratings
     * of the reviews the word appeared in, element 1 is the number of times
     * the word was counted.
     */
    private HashMap<String, int[]> wordFreqMap;

    /** Stop words read so far (from the stop-words file and/or createStopWordsSet). */
    private HashSet<String> stopWords;

    /** Path whose contents were last merged into {@link #stopWords}; null if none yet. */
    private String loadedStopWordsFile;

    // NOTE(review): machine-specific default kept for backward compatibility;
    // assign a different path to this field before cleaning on another machine.
    String stopWordsFile =
        "C:/Users/traip/Desktop/CSE8B PA7/starter/stopWords.txt";
    String whiteSpace = " ";
    String emptySpace = "";
    // Interior of a regex character class matching every punctuation character.
    String allPunctuations = "!\"#$%&'()*+,-./:;<=>?@\\[\\]\\\\^_`{|}~";

    /** Creates a predictor with no stop words and no learned word statistics. */
    public RatingPredictor() {
        this.stopWords = new HashSet<>();
        this.wordFreqMap = new HashMap<>();
    }

    /**
     * Splits one review into words at single spaces.
     *
     * @param sentence entire contents of one review
     * @return the words in order, or null when sentence is null or empty
     */
    public ArrayList<String> splitLine(String sentence) {
        if (sentence == null || sentence.length() == 0) {
            return null;
        }
        return new ArrayList<>(Arrays.asList(sentence.split(whiteSpace)));
    }

    /**
     * Splits every word at hyphens and single quotes, so "Jungle-Book" becomes
     * "Jungle", "Book" and "It's" becomes "It", "s".
     *
     * @param words individual words of a review
     * @return a new list with the pieces, or null when words is null or empty
     */
    public ArrayList<String> splitAtHyphensAndQuotes(ArrayList<String> words) {
        return replaceAndSplit(words, "[-']");
    }

    /**
     * Removes punctuation from every word; a word with punctuation in the
     * middle is split into its pieces.
     *
     * @param words words of the review that needs to be cleaned
     * @return a new list without punctuation, or null when words is null or empty
     */
    public ArrayList<String> removePunctuation(ArrayList<String> words) {
        return replaceAndSplit(words, "[" + allPunctuations + "]");
    }

    /**
     * Replaces every match of separatorRegex with a space, splits each word at
     * spaces and collects the pieces in order.
     */
    private ArrayList<String> replaceAndSplit(ArrayList<String> words,
                                              String separatorRegex) {
        if (words == null || words.size() == 0) {
            return null;
        }
        ArrayList<String> pieces = new ArrayList<>();
        for (String word : words) {
            String spaced = word.replaceAll(separatorRegex, whiteSpace);
            pieces.addAll(Arrays.asList(spaced.split(whiteSpace)));
        }
        return pieces;
    }

    /**
     * Strips the space characters out of every word.
     *
     * @param words uncleaned words
     * @return a new list whose words contain no spaces, or null when words is
     *         null or empty
     */
    public ArrayList<String> removeWhiteSpaces(ArrayList<String> words) {
        if (words == null || words.size() == 0) {
            return null;
        }
        ArrayList<String> cleanWords = new ArrayList<>(words.size());
        for (String word : words) {
            cleanWords.add(word.replaceAll(whiteSpace, emptySpace));
        }
        return cleanWords;
    }

    /**
     * Drops the empty strings.
     *
     * @param words uncleaned words
     * @return a new list without empty strings, or null when words is null or
     *         empty
     */
    public ArrayList<String> removeEmptyWords(ArrayList<String> words) {
        if (words == null || words.size() == 0) {
            return null;
        }
        ArrayList<String> cleanWords = new ArrayList<>(words.size());
        for (String word : words) {
            if (!word.equals(emptySpace)) {
                cleanWords.add(word);
            }
        }
        return cleanWords;
    }

    /**
     * Drops the words that are exactly one character long.
     *
     * @param words uncleaned words
     * @return a new list without single-letter words, or null when words is
     *         null or empty
     */
    public ArrayList<String> removeSingleLetterWords(ArrayList<String> words) {
        if (words == null || words.size() == 0) {
            return null;
        }
        ArrayList<String> cleanWords = new ArrayList<>(words.size());
        for (String word : words) {
            if (word.length() != 1) {
                cleanWords.add(word);
            }
        }
        return cleanWords;
    }

    /**
     * Converts every word to lower case.
     *
     * @param words uncleaned words
     * @return a new list of lower-cased words, or null when words is null or
     *         empty
     */
    public ArrayList<String> toLowerCase(ArrayList<String> words) {
        if (words == null || words.size() == 0) {
            return null;
        }
        ArrayList<String> cleanWords = new ArrayList<>(words.size());
        for (String word : words) {
            // Locale.ROOT keeps the result independent of the machine's locale
            // (e.g. "I" must become "i", not the Turkish dotless i).
            cleanWords.add(word.toLowerCase(Locale.ROOT));
        }
        return cleanWords;
    }

    /**
     * Drops every word that is a stop word. The stop words are those already
     * held by this object plus the lines of the file named by
     * {@code stopWordsFile}; the file is read once per path. When the file is
     * missing a warning is printed and the stop words already held are used.
     *
     * @param arrList uncleaned words
     * @return a new list without stop words, or null when arrList is null or
     *         empty
     */
    public ArrayList<String> removeStopWords(ArrayList<String> arrList) {
        if (arrList == null || arrList.size() == 0) {
            return null;
        }
        loadStopWords();
        ArrayList<String> cleanWords = new ArrayList<>(arrList.size());
        for (String word : arrList) {
            if (!this.stopWords.contains(word)) {
                cleanWords.add(word);
            }
        }
        return cleanWords;
    }

    /** Merges the lines of stopWordsFile into stopWords, once per distinct path. */
    private void loadStopWords() {
        if (stopWordsFile == null || stopWordsFile.equals(loadedStopWordsFile)) {
            return;
        }
        // Remember the attempt even if it fails, so a missing file is reported
        // once instead of once per review line.
        loadedStopWordsFile = stopWordsFile;
        try {
            readStopWords(new File(stopWordsFile));
        } catch (FileNotFoundException ex) {
            System.err.println("Stop words file not found: " + stopWordsFile
                + " (continuing with " + this.stopWords.size()
                + " stop words already loaded)");
        }
    }

    /** Adds every line of the given file to stopWords. */
    private void readStopWords(File file) throws FileNotFoundException {
        try (Scanner reader = new Scanner(file)) {
            while (reader.hasNext()) {
                this.stopWords.add(reader.nextLine());
            }
        }
    }

    /**
     * Reads the stop words from inFile into this object and writes every stop
     * word held so far to outFile, one per line and without repetitions.
     *
     * @param inFile  path of the stop-words input file
     * @param outFile path of the file that receives the unique stop words
     */
    public void createStopWordsSet(String inFile, String outFile) {
        try {
            readStopWords(new File(inFile));
            // try-with-resources closes (and thereby flushes) the writer;
            // without the close the output file would stay blank.
            try (PrintWriter writer = new PrintWriter(new File(outFile))) {
                for (String stopWord : this.stopWords) {
                    writer.println(stopWord);
                }
            }
        } catch (FileNotFoundException ex) {
            System.err.println("createStopWordsSet: cannot open file: "
                + ex.getMessage());
        }
    }

    /**
     * Cleans every line of inFile with the cleaning methods of this class and
     * writes the result to outFile. Each output line holds the rating (only
     * when ratingIncluded is true: the first character of the raw line)
     * followed by the cleaned words, each followed by one space. A blank input
     * line produces a blank output line.
     *
     * @param inFile         path of the raw review file
     * @param outFile        path of the cleaned file to create
     * @param ratingIncluded whether each line of inFile starts with a rating
     */
    public void cleanData(String inFile, String outFile, boolean ratingIncluded) {
        try (Scanner reader = new Scanner(new File(inFile));
             PrintWriter writer = new PrintWriter(new File(outFile))) {
            while (reader.hasNext()) {
                String rawLine = reader.nextLine();
                // A blank line has no first character to use as rating.
                if (ratingIncluded && !rawLine.isEmpty()) {
                    writer.print(rawLine.substring(0, 1) + whiteSpace);
                }
                ArrayList<String> cleanWords = cleanLine(rawLine);
                if (cleanWords != null) {
                    for (String word : cleanWords) {
                        writer.print(word + whiteSpace);
                    }
                }
                writer.print("\n");
            }
        } catch (FileNotFoundException ex) {
            System.err.println("cleanData: cannot open file: " + ex.getMessage());
        }
    }

    /** Runs the full cleaning pipeline on one raw line; null when nothing is left to clean. */
    private ArrayList<String> cleanLine(String rawLine) {
        ArrayList<String> words = splitLine(rawLine);
        words = splitAtHyphensAndQuotes(words);
        words = removePunctuation(words);
        words = removeWhiteSpaces(words);
        words = removeEmptyWords(words);
        words = removeSingleLetterWords(words);
        words = toLowerCase(words);
        return removeStopWords(words);
    }

    /**
     * Learns word statistics from a cleaned, rated file. The first token of
     * each line is the integer rating; for every following word the rating is
     * added to the word's rating sum and its count is incremented. Blank lines
     * and lines without words are ignored; a line whose first token is not an
     * integer is reported and skipped.
     *
     * @param inCleanFile path of the cleaned file with ratings
     */
    public void updateHashMap(String inCleanFile) {
        try (Scanner reader = new Scanner(new File(inCleanFile))) {
            while (reader.hasNextLine()) {
                ArrayList<String> tokens = splitLine(reader.nextLine());
                if (tokens == null || tokens.size() < 2) {
                    continue;
                }
                int rating;
                try {
                    rating = Integer.parseInt(tokens.get(0));
                } catch (NumberFormatException ex) {
                    System.err.println("updateHashMap: skipping line without"
                        + " integer rating: " + tokens.get(0));
                    continue;
                }
                for (int i = 1; i < tokens.size(); i++) {
                    String word = tokens.get(i);
                    int[] stats = this.wordFreqMap.get(word);
                    if (stats == null) {
                        stats = new int[2];
                        this.wordFreqMap.put(word, stats);
                    }
                    stats[0] += rating;
                    stats[1] += 1;
                }
            }
        } catch (FileNotFoundException ex) {
            System.err.println("updateHashMap: cannot open file: "
                + ex.getMessage());
        }
    }

    /**
     * Predicts a rating for every line of a cleaned, unrated file and writes
     * the ratings, formatted with one decimal, one per line to
     * outRatingsFile (no line terminator after the last rating).
     *
     * @param inCleanFile    path of the cleaned review file
     * @param outRatingsFile path of the ratings file to create
     */
    public void rateReviews(String inCleanFile, String outRatingsFile) {
        try (Scanner reader = new Scanner(new File(inCleanFile));
             PrintWriter writer = new PrintWriter(new File(outRatingsFile))) {
            while (reader.hasNextLine()) {
                double rating = predictRating(splitLine(reader.nextLine()));
                // Locale.ROOT guarantees a '.' decimal separator everywhere.
                String formatted = String.format(Locale.ROOT, "%.1f", rating);
                if (reader.hasNextLine()) {
                    writer.println(formatted);
                } else {
                    writer.print(formatted);
                }
            }
        } catch (FileNotFoundException ex) {
            System.err.println("rateReviews: cannot open file: "
                + ex.getMessage());
        }
    }

    /**
     * Averages the per-word ratings of a review; an unknown word counts as
     * UNKNOWN_WORD_RATING, and so does a review without any words.
     */
    private double predictRating(ArrayList<String> words) {
        if (words == null || words.isEmpty()) {
            return UNKNOWN_WORD_RATING;
        }
        double sum = 0.0;
        for (String word : words) {
            int[] stats = this.wordFreqMap.get(word);
            if (stats == null) {
                sum += UNKNOWN_WORD_RATING;
            } else {
                sum += ((double) stats[0]) / stats[1];
            }
        }
        return sum / words.size();
    }

    /** Manual smoke test of every method. */
    public static void main(String[] args) {
        RatingPredictor obj = new RatingPredictor();

        // testing SplitLine
        System.out.println("testing SplitLine");
        String testReview =
            "The Jungle-Book is a fantastic movie! It's the best!!";
        ArrayList<String> spitLineTest = obj.splitLine(testReview);
        System.out.println(spitLineTest);

        //testing splitAtHyphensAndQuotes
        System.out.println("testing splitAtHyphensAndQuotes");
        ArrayList<String> rawHyphenAndQuotes = new ArrayList<>(
            Arrays.asList("The", "Jungle-Book", "is", "a", "fantastic",
                "movie!", "It's", "the", "best!!"));
        ArrayList<String> cleanHyphenAndQuotes =
            obj.splitAtHyphensAndQuotes(rawHyphenAndQuotes);
        System.out.println(cleanHyphenAndQuotes);

        //testing removePunctuation
        System.out.println("testing removePunctuation");
        ArrayList<String> rawRemovePunctuation = new ArrayList<>(
            Arrays.asList("The", "Jungle", "Book", "is", "a", "fantastic",
                "movie!", "It", "s", "the", "best!!"));
        ArrayList<String> cleanRemovePunctuation =
            obj.removePunctuation(rawRemovePunctuation);
        System.out.println(cleanRemovePunctuation);

        //testing removeWhiteSpaces
        System.out.println("testing removeWhiteSpaces");
        ArrayList<String> rawRemoveWhiteSpaces = new ArrayList<>(
            Arrays.asList("The", "Jungle", "Book ", "is", "a", " fantastic",
                "movie", " It", "s", "the ", "best"));
        System.out.print("Raw:");
        System.out.println(rawRemoveWhiteSpaces);
        ArrayList<String> cleanRemoveWhiteSpaces =
            obj.removeWhiteSpaces(rawRemoveWhiteSpaces);
        System.out.println(cleanRemoveWhiteSpaces);

        //testing removeEmptyWords
        System.out.println("testing removeEmptyWords");
        ArrayList<String> rawRemoveEmptyWords = new ArrayList<>(
            Arrays.asList("The", "Jungle", "Book", "", "", "fantastic",
                "movie", "It", "s", "", "best"));
        System.out.println("Raw:" + rawRemoveEmptyWords);
        ArrayList<String> cleanRemoveEmptyWords =
            obj.removeEmptyWords(rawRemoveEmptyWords);
        System.out.println(cleanRemoveEmptyWords);

        //testing removeSingleLetterWords
        System.out.println("testing removeSingleLetterWords");
        ArrayList<String> rawRemoveSingleLetterWords = new ArrayList<>(
            Arrays.asList("The", "Jungle", "Book", "is", "a", "fantastic",
                "movie", "It", "s", "the", "best"));
        ArrayList<String> cleanRemoveSingleLetterWords =
            obj.removeSingleLetterWords(rawRemoveSingleLetterWords);
        System.out.println(cleanRemoveSingleLetterWords);

        //testing toLowerCase
        System.out.println("testing toLowerCase");
        ArrayList<String> rawToLowerCase = new ArrayList<>(
            Arrays.asList("The", "Jungle", "Book", "is", "fantastic",
                "movie", "It", "the", "best"));
        ArrayList<String> cleanToLowerCase = obj.toLowerCase(rawToLowerCase);
        System.out.println(cleanToLowerCase);

        //testing removeStopWords
        System.out.println("testing removeStopWords");
        ArrayList<String> rawRemoveStopWords = new ArrayList<>(
            Arrays.asList("the", "jungle", "book", "is", "fantastic",
                "movie", "it", "the", "best"));
        ArrayList<String> cleanRemoveStopWords =
            obj.removeStopWords(rawRemoveStopWords);
        System.out.println(cleanRemoveStopWords);

        //testing createStopWordsSet
        System.out.println("testing ceateStopWordsSet");
        obj.createStopWordsSet("stopwords.txt", "uniqueStopwords.txt");

        //testing cleanData
        System.out.println("testing cleanData");
        obj.cleanData("rawReviewRatings.txt", "cleanReviewRatings.txt", true);
        obj.cleanData("rawReviewRatingsBig.txt",
            "cleanRawReviewRatingsBig.txt", true);
        obj.cleanData("rawReviews.txt", "cleanRawReviews.txt", false);
        obj.cleanData("rawReviewsBig.txt", "cleanRawReviewsBig.txt", false);

        //testing updateHashMap
        System.out.println("testing updateHashMap");
        obj.updateHashMap("cleanReviewRatings.txt");

        //testing rateReviews
        System.out.println("testing rateReviews");
        obj.rateReviews("cleanRawReviews.txt", "ratings.txt");
    }
}