#Chay Bagan Homework 6

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(FNN)
# ---- Load data and build the training / validation partition ----
housing.df <- read.csv(file = "BostonHousing.csv")

# Fixed seed so the 60/40 row split is reproducible.
set.seed(123)
n.train <- 0.6 * nrow(housing.df)
train.index <- sample(row.names(housing.df), size = n.train)
# Every row name that was not drawn for training goes to validation.
valid.index <- setdiff(row.names(housing.df), train.index)

# Column 14 is dropped from both partitions (presumably CAT.MEDV -- confirm
# against the CSV header); column 13 (MEDV) is kept as the response.
train.df <- housing.df[train.index, -14]
valid.df <- housing.df[valid.index, -14]

# ---- Normalise ----
# Centering and scaling statistics are estimated on the training partition
# only, then the same transformation is applied to every data set.
norm.values <- preProcess(train.df, method = c("center", "scale"))

train.norm.df <- as.data.frame(predict(norm.values, train.df))
valid.norm.df <- as.data.frame(predict(norm.values, valid.df))
housing.norm.df <- as.data.frame(predict(norm.values, housing.df))

# ---- Part A: choose k by validation RMSE ----
# MEDV is a numeric response, so this is k-NN *regression*: the prediction is
# the mean MEDV of the k nearest training records (FNN::knn.reg).
# class::knn must not be used here -- it is a classifier that treats every
# distinct MEDV value as a class label and returns a majority vote among them,
# which is not a numeric prediction.
accuracy.df <- data.frame(k = seq(1, 5, 1), RMSE = rep(0, 5))

for (i in seq_len(nrow(accuracy.df))) {
  # Column 13 (MEDV) is removed from the predictors and supplied as `y`;
  # the un-normalised MEDV is used so RMSE is in the original units.
  knn.pred <- FNN::knn.reg(train = train.norm.df[, -13],
                           test = valid.norm.df[, -13],
                           y = train.df[, 13],
                           k = accuracy.df$k[i])
  accuracy.df[i, 2] <- RMSE(knn.pred$pred, valid.df[, 13])
}

accuracy.df
# NOTE: the RMSE table previously pasted here was produced with class::knn
# (classification) and no longer matches this code; re-run the script to
# regenerate the validation RMSE for k = 1..5.
# ---- Part B: predict MEDV for a new tract ----
# Predictor values for the new record, in the same order as the predictor
# columns of the training data (every column except column 13, MEDV).
new.df <- data.frame(0.2, 0, 7, 0, 0.538, 6, 62, 4.7, 4, 307, 21, 10)
names(new.df) <- names(train.norm.df)[-13]

# The new record must be normalised with the TRAINING means and standard
# deviations (norm.values). Fitting a fresh preProcess on a single row cannot
# estimate a standard deviation, and would not put the record on the same
# scale as train.norm.df anyway.
# norm.values was fitted on all 13 columns including the response, so a
# placeholder response column is added for predict() and dropped afterwards.
new.padded.df <- new.df
new.padded.df[[names(train.df)[13]]] <- 0
new.norm.df <- as.data.frame(predict(norm.values, newdata = new.padded.df))
new.norm.df <- new.norm.df[, names(new.df), drop = FALSE]

# Use the k with the lowest validation RMSE from the tuning table instead of
# a hard-coded value.
best.k <- accuracy.df$k[which.min(accuracy.df$RMSE)]

# k-NN regression: average MEDV of the best.k nearest training records.
new.knn.pred <- FNN::knn.reg(train = train.norm.df[, -13],
                             test = new.norm.df,
                             y = train.df$MEDV,
                             k = best.k)$pred
new.knn.pred

# ---- Part C: error of the training set ----
# Score the training records themselves with the same model and compare the
# predictions with the training MEDV. (Comparing the single new-record
# prediction with the validation MEDV values is not a training error.)
train.knn.pred <- FNN::knn.reg(train = train.norm.df[, -13],
                               test = train.norm.df[, -13],
                               y = train.df$MEDV,
                               k = best.k)$pred
new.accuracy.df <- RMSE(train.knn.pred, train.df$MEDV)
new.accuracy.df
# NOTE: the outputs previously pasted here (prediction 21, RMSE 9.469794)
# came from the earlier, incorrect computation; re-run to regenerate them.