raft-key-value-store / labs-fall23-AnurimaVaishnavi-master / assignment1-1 / q1.go
q1.go
Raw
package cos418_hw1_1

import (
	"fmt"
	"sort"
	"regexp"
	"strings"
	"os"
)

// NOTE: this comment describes topWords (defined below), not the check helper
// that immediately follows it.
//
// Find the top K most common words in a text document.
//	path: location of the document
//	numWords: number of words to return (i.e. k)
//	charThreshold: character threshold for whether a token qualifies as a word,
//		e.g. charThreshold = 5 means "apple" is a word but "pear" is not.
// Matching is case insensitive, e.g. "Orange" and "orange" are considered the same word.
// A word comprises alphanumeric characters only. All punctuation and other characters
// are removed, e.g. "don't" becomes "dont".
// You should use `checkError` to handle potential errors.
func check(e error) {
	// Guard clause: a nil error means there is nothing to report.
	if e == nil {
		return
	}
	// Any non-nil error is treated as fatal and raised as a panic carrying e.
	panic(e)
}

// topWords returns the most common words in the document at path, most
// frequent first.
//
//	path: location of the document
//	numWords: maximum number of words to return (i.e. k); values below zero
//		are treated as zero, and values above the number of distinct
//		words return every word
//	charThreshold: minimum length a cleaned token must have to count as a
//		word, e.g. charThreshold = 5 means "apple" is a word but "pear" is not
//
// Matching is case insensitive: the text is lower-cased before counting, so
// "Orange" and "orange" are the same word. A word comprises ASCII letters and
// digits only; every other character is removed, e.g. "don't" becomes "dont".
// Tokens that contain no alphanumeric characters at all are never counted.
//
// A failure to read the file is handed to checkError, which is defined outside
// this function (its exact behavior is not visible here).
func topWords(path string, numWords int, charThreshold int) []WordCount {
	content, err := os.ReadFile(path)
	// Handle the read error immediately, and only through checkError.
	checkError(err)

	nonAlphanumeric := regexp.MustCompile("[^0-9a-zA-Z]+")

	counts := make(map[string]int)
	for _, token := range strings.Fields(strings.ToLower(string(content))) {
		word := nonAlphanumeric.ReplaceAllString(token, "")
		// A token made only of punctuation cleans down to "", which is not a
		// word even when charThreshold is zero or negative. After cleaning the
		// word is pure ASCII, so len (bytes) equals its character count.
		if word == "" || len(word) < charThreshold {
			continue
		}
		counts[word]++
	}

	// Map iteration order is random; sortWordCounts imposes the final order.
	var result []WordCount
	for word, count := range counts {
		result = append(result, WordCount{word, count})
	}
	sortWordCounts(result)

	// Clamp numWords into [0, len(result)] so the slice expression cannot panic.
	if numWords < 0 {
		numWords = 0
	}
	if numWords > len(result) {
		numWords = len(result)
	}
	return result[:numWords]
}

// WordCount pairs a word with the number of times it was observed in a document.
type WordCount struct {
	Word  string // the observed word
	Count int    // how many times Word was observed
}

// String renders the pair as "<Word>: <Count>".
func (wc WordCount) String() string {
	word, count := wc.Word, wc.Count
	return fmt.Sprintf("%v: %v", word, count)
}

// sortWordCounts sorts wordCounts in place.
// Entries are ordered by Count, largest first; entries with equal counts are
// ordered by Word in ascending string order.
// DO NOT MODIFY THIS FUNCTION!
func sortWordCounts(wordCounts []WordCount) {
	sort.Slice(wordCounts, func(i, j int) bool {
		wc1 := wordCounts[i]
		wc2 := wordCounts[j]
		// Equal counts: fall back to comparing the words themselves.
		if wc1.Count == wc2.Count {
			return wc1.Word < wc2.Word
		}
		// Otherwise the higher count sorts first (descending by Count).
		return wc1.Count > wc2.Count
	})
}