# K-means-CS470 / Kmeans.R
setwd("C:/Users/Tayler/Documents/470/finalproject")
label.input.filename <- "y_train.txt"
feature.input.filename <- "X_train.txt"

library(ggplot2)
library(plyr)
library(data.table)
# convert labels given to vector
label.vec <- as.vector(data.table::fread(label.input.filename)[[1]])
# input features as a data.table
feature.dt <- data.table::fread(feature.input.filename)
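# quick sanity check (optional, not in the original script): labels and
# features should line up row-for-row
stopifnot(length(label.vec) == nrow(feature.dt))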

# actual file only has 561 columns not 571 - conversion unnecessary
#core.featureset <- 1:561
#core.feature.dt <- feature.dt[,..core.featureset]

# take in two observations and calculate the euclidean distance between them
pairwise_distance <- function(obs1, obs2){
  # coerce each observation (one-row data.table, matrix row, or vector) to a
  # plain numeric vector before computing the euclidean distance
  obs1 <- as.numeric(unlist(obs1))
  obs1 <- obs1[!is.na(obs1)]
  obs2 <- as.numeric(unlist(obs2))
  obs2 <- obs2[!is.na(obs2)]
  sqrt(sum((obs1 - obs2)^2))
}
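# quick checks of pairwise_distance (optional, not in the original script):
# the distance from an observation to itself is 0 and the distance is symmetric
stopifnot(pairwise_distance(feature.dt[1, ], feature.dt[1, ]) == 0)
stopifnot(all.equal(pairwise_distance(feature.dt[1, ], feature.dt[2, ]),
                    pairwise_distance(feature.dt[2, ], feature.dt[1, ])))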

# returns a data.table of cluster assignments given the assignment matrix
compute_assignment <- function(cluster.matrix){
  assmt.vec <- vector(mode = "numeric",length=nrow(cluster.matrix))
  for(n in 1:length(assmt.vec)){
    assmt.vec[n] = (which(cluster.matrix[n,]==1))
  }
  return(assmt.vec)
}
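# small illustration (not from the original script): a 3-observation,
# 2-cluster assignment matrix maps each row to the column holding the 1
stopifnot(identical(
  compute_assignment(matrix(c(1, 0,
                              0, 1,
                              1, 0), nrow = 3, byrow = TRUE)),
  c(1, 2, 1)))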

compute_colmeans <- function(data.dt, label.vector){
  mean.list <- list()
  # one row of per-column means for each distinct label value
  for(label in 1:max(label.vector)){
    mean.list[[paste(label)]] <-
      colMeans(data.dt[which(label.vector == label),])
  }
  mean.dt <- do.call(rbind, mean.list)
  return(as.data.table(mean.dt))
}
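# Example use (assumes label.vec loaded above; one row of per-class feature
# means for each label in the original labeling):
# compute_colmeans(feature.dt, label.vec)[, 1:5]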

compute_MSE <- function(data.dt, label.vector){
  mean.dt <- compute_colmeans(data.dt, label.vector)
  total.mse <- 0
  # accumulate the squared distance from every observation to its label's mean
  for(label in 1:max(label.vector)){
    temp.dt <- data.dt[which(label.vector == label), ]
    for(n in 1:nrow(temp.dt)){
      total.mse <- total.mse + (pairwise_distance(mean.dt[label,], temp.dt[n,]))^2
    }
  }
  return(total.mse)
}
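# Note: the "MSE" computed here is the total within-cluster sum of squared
# euclidean distances (the quantity k-means minimizes), not divided by the
# number of observations; stats::kmeans reports the same quantity as tot.withinss.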
# set num of clusters & make random assignment
set.seed(1)
# k values to loop through given the prompt
K.iter.vals <- c(4:7)
clustering.result.list <- list()
MSE.outer.list <- list()
# K.iter.vals <- 6   # uncomment to run a single K value while debugging
count <- 0
MSE.total <- 0
for(K.outer in K.iter.vals){
  # sample K.outer random observations (rows) and use them as the initial cluster means
  initial.center.assignment.indices <- sample(nrow(feature.dt), K.outer)
  init.assmt.means <- as.matrix(feature.dt[initial.center.assignment.indices,])
  
  # create variable to compare and stop running kmeans when they are equal 
  # (initializing them to not be equivalent in order for the loop to begin)
  previous.means <- init.assmt.means[-1]
  
  cluster.assmt.matrix <- matrix(0, nrow = nrow(feature.dt), ncol = K.outer) 
  dist.from.mean.vec <- vector(mode = "numeric", length = K.outer)
  
  different.k.list <- list()
  MSE.inner.list <- list()
  cluster.assmt.list <- list()
  cluster.iter.mean.list <- list()
  
  # upper bound for iterations
  iterations <- 100
  count = 0 # actual iterations carried out (needed for plot data.table creation below)
  for(iter in 1:iterations){
    if(identical(previous.means, init.assmt.means)){
      break
    }
    MSE.total = 0
    # loop through each observation in the data
    for(obs.n in 1:nrow(feature.dt)){
      dist.from.mean.vec <- vector(mode = "numeric", length = K.outer)
      # for each observation, compute the euclidean distance from each cluster mean
      for(k in 1:K.outer){
        dist.from.mean.vec[k] = pairwise_distance(init.assmt.means[k, ], feature.dt[obs.n])
      }
      # assign the observation to the cluster with the closest mean
      assmt.index <- which.min(dist.from.mean.vec)
      cluster.assmt.matrix[obs.n, ] = 0
      cluster.assmt.matrix[obs.n, assmt.index] = 1
      MSE.total = MSE.total + (dist.from.mean.vec[assmt.index]^2)
    }
    # recompute cluster means
    cluster.mean.list <- list()
    for(k in 1:K.outer){
      cluster.mean.list[[paste(k)]] <- colMeans(feature.dt[which(cluster.assmt.matrix[,k]==1),])
    }
    previous.means <- init.assmt.means
    init.assmt.means <- do.call(rbind, cluster.mean.list)
    
    # list to store all means we compute along the way for future reference
    cluster.iter.mean.list[[paste(iter)]] <- data.table(
      iter = iter,
      means = init.assmt.means)
    # list to store all assignments to clusters 
    cluster.assmt.list[[paste(iter)]] <- data.table(
      iter = iter,
      obs.assignments = compute_assignment(cluster.assmt.matrix))
    # list to keep track of mean squared error
    MSE.inner.list[[paste(iter)]] <- data.table(K.outer, iter, MSE.total)
    
    count = count + 1
  } 
  cluster.assmt.dt <- do.call(rbind, cluster.assmt.list)
  cluster.means.dt <- do.call(rbind, cluster.iter.mean.list)
  
  # store the final iteration's assignments and means for this K
  clustering.result.list[[paste(K.outer)]] <- list(
    assmt = cluster.assmt.dt[iter == count]$obs.assignments,
    means = cluster.means.dt[iter == count])
  
  MSE.outer.list[[paste(K.outer)]] <- MSE.inner.list
}
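# Optional cross-check (not part of the assignment code): the built-in
# stats::kmeans, run on the same data with the same K, should reach a total
# within-cluster sum of squares in the same ballpark as the final MSE.total
# above, which is a useful sanity check on the hand-rolled loop.
# builtin.fit <- stats::kmeans(as.matrix(feature.dt), centers = 6, nstart = 5)
# builtin.fit$tot.withinss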

# plotting data table for showing each iteration with a single k value
# (feature rows are repeated once per completed iteration and tagged with that
# iteration's cluster assignments)
plotting.dt <- feature.dt[rep(1:nrow(feature.dt), count),]
plotting.dt[,iter := cluster.assmt.dt$iter]
plotting.dt[,obs.assignments := cluster.assmt.dt$obs.assignments]

# ggplot for showing each iteration for a single k
ggplot()+
  geom_point(aes(
    x=V1,
    y=V3,
    color=obs.assignments
  ),data=plotting.dt)+
  geom_point(aes(
    x=means.V1,
    y=means.V3,
  ),data=cluster.means.dt, color = "#FF0000",size = 2)+
  facet_grid(iter ~ ., scales = "free")

# create data structure for plotting clusters with different k values
final.res.list <- list()
for(k in K.iter.vals){
  final.res.list[[paste(k)]] <- data.table(
    k,
    label = as.numeric(clustering.result.list[[paste(k)]]$assmt))
}
final.res.dt <- do.call(rbind, final.res.list)
plotting.dt <- feature.dt[rep(1:nrow(feature.dt), length(K.iter.vals)),]
plotting.dt[, k := final.res.dt$k]
plotting.dt[, label := final.res.dt$label]

# create data structure for plotting means
# note on data.table scoping: inside final.res.dt[...] a loop variable also
# named k is shadowed by the k column (k == k is then always TRUE), so use a
# differently named loop variable
plot.mean.list <- list()
for(k.val in K.iter.vals){
  plot.mean.list[[paste(k.val)]] <-
    compute_colmeans(feature.dt, final.res.dt[k == k.val]$label)
}
plot.mean.dt <- do.call(rbind, plot.mean.list)
k.vec <- vector(mode = "numeric")
for(k in K.iter.vals){
  k.vec <- c(k.vec, rep(k, each = k))
}
plot.mean.dt[, k := k.vec]

# plot for k = 4:7 showing labeling and mean centers
ggplot()+
  geom_point(aes(
    x = V1,
    y = V3,
    color = label,
  ),data=plotting.dt)+
  geom_point(aes(
    x = V1, 
    y = V3,
  ),data = plot.mean.dt, color = "red", size = 2)+
  facet_grid(k ~ ., scales = "free")
# the plot itself is not completely representative of the data, as there are 561 
# dimensions and we are plotting two of them

# ggplot of original labeling
orig.means <- compute_colmeans(feature.dt, label.vec)
ggplot()+
  geom_point(aes(
    x = V1,
    y = V3,
    color = label.vec,
  ),data = feature.dt)+
  geom_point(aes(
    x = V1,
    y = V3
  ),data = orig.means, color = "red")

MSE.plot.list <- list()
# take the final iteration's MSE for each K (the last entry of each inner
# list; the iteration at which each K converges varies from run to run)
for(k.val in K.iter.vals){
  inner.list <- MSE.outer.list[[paste(k.val)]]
  MSE.plot.list[[paste(k.val)]] <- as.data.table(inner.list[[length(inner.list)]])
}
MSE.plot.dt <- do.call(rbind.fill, MSE.plot.list)
# MSE of the original (given) labeling, used as the reference line in the plot below
given.mse <- compute_MSE(feature.dt, label.vec)
# MSE of the final k-means labeling for each K (same scoping caveat as above)
mse.list <- list()
for(k.val in K.iter.vals){
  mse.list[[paste(k.val)]] <- compute_MSE(feature.dt, final.res.dt[k == k.val]$label)
}
mse.dt <- data.table(k = K.iter.vals, mse = unlist(mse.list))
ggplot()+
  geom_path(aes(
    x = k,
    y = mse
  ),data=mse.dt)+
  geom_hline(yintercept = given.mse, color = "red")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Finding train and test subsets
tsize <- as.integer(nrow(feature.dt) * .9)
train.subset.indices <- sample(1:nrow(feature.dt), size = tsize)
train.subset <- feature.dt[train.subset.indices,]
test.subset.indices <- which(!((1:nrow(feature.dt))%in%train.subset.indices))
test.subset <- feature.dt[test.subset.indices,]
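# quick check: the train and test indices partition the rows exactly
stopifnot(nrow(train.subset) + nrow(test.subset) == nrow(feature.dt))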
# NOTE: the k-means loop above was re-run with feature.dt replaced by
# train.subset, so cluster.assmt.matrix has one row per training observation

# add features to the data.tables so we can keep track of cluster assignments
K.neighbors <- 3
train.subset[, assmt := compute_assignment(cluster.assmt.matrix)]
test.subset[, assmt := 0] # placeholder; overwritten by the nearest-neighbor vote below

# initialize variables for K.n.n
temp.list <- list()
temp.dt <- data.table()

# for each observation in the test subset, find the k closest points.
# then using the tabulate function find the index of the 
# maximum value and assign it to that cluster

for(obs.n.test in 1:nrow(test.subset)){
  for(obs.n.train in 1:nrow(train.subset)){
    # design pattern of creating a list and then using rbind to get the results
    # here we are storing the original position of test & train, 
    # the cluster assignment #, and the distance between the test and 
    # train points
    temp.list[[paste(obs.n.train)]] <-
      data.table(obs.n.test, obs.n.train, assmt = 0,
                 dist = pairwise_distance(test.subset[obs.n.test, -"assmt"],
                                          train.subset[obs.n.train, -"assmt"]))
  }
  temp.dt <- do.call(rbind, temp.list)
  temp.dt <- temp.dt[order(temp.dt$dist),]
  temp.dt <- temp.dt[1:K.neighbors,]
  
  # for each item in the data.table, find which cluster assignment is represented the most
  for(item in 1:nrow(temp.dt)){
    temp.dt[item,]$assmt = train.subset$assmt[temp.dt$obs.n.train[item]]
  }
  test.subset[obs.n.test,]$assmt = which.max(tabulate(temp.dt$assmt))
  
}
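# Optional cross-check (assumption, not in the original script): class::knn
# implements the same majority-vote nearest-neighbor rule and could be used to
# verify the loop above:
# library(class)
# knn.pred <- class::knn(train = train.subset[, -"assmt"],
#                        test  = test.subset[, -"assmt"],
#                        cl    = train.subset$assmt,
#                        k     = K.neighbors)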

# mapping K.n.n results back to the size of the original data set
trained.feature.dt <- copy(feature.dt) # copy() so := below does not modify feature.dt by reference
trained.feature.dt[, assmt := 0]
trained.feature.dt[train.subset.indices,]$assmt = train.subset$assmt
trained.feature.dt[test.subset.indices,]$assmt = test.subset$assmt
trained.means <- compute_colmeans(trained.feature.dt[,-"assmt"], trained.feature.dt$assmt)

ggplot()+
  geom_point(aes(
    x = V1,
    y = V3,
    color = assmt,
  ),data = trained.feature.dt)+
  geom_point(aes(
    x = V1,
    y = V3,
  ),data = trained.means, color = "red")
  
# PCA

pca.fit <- prcomp(feature.dt, rank. = 2)
digits.pca.dt <- as.data.table(pca.fit$x)
digits.pca.dt[,label:= label.vec]
# taking a subset (first half of the rows) so the ggplot is not crammed with labels
subset.digits.pca.dt <- digits.pca.dt[1:(nrow(digits.pca.dt)/2),]
ggplot()+
  geom_text(aes(
    x = PC1,
    y = PC2,
    label = label,
  ), data = subset.digits.pca.dt, size = 2)
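# Optional: proportion of total variance captured by the first two principal
# components, a rough gauge of how faithful the 2-D plot above is
pca.fit$sdev[1:2]^2 / sum(pca.fit$sdev^2)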

# GMM
K <- 6
mean.vec <- vector(mode = "numeric", length = ncol(feature.dt))
mean.vec.mat <- matrix(nrow = K, ncol = length(mean.vec))       # one mean vector per cluster
cov.mat <- matrix(nrow = ncol(feature.dt), ncol = ncol(feature.dt))
prior.weight.vec <- vector(mode = "numeric", length = K)
prior.weight.mat <- matrix(nrow = nrow(feature.dt), ncol = K)   # placeholder for per-observation priors
weights.mat <- matrix(nrow = nrow(feature.dt), ncol = K)        # placeholder for responsibilities (E-step output)
# E-step: compute the responsibility (posterior probability) of each cluster
# for each observation given the current means, covariances, and prior weights
Estep <- function(matrix.dt){
  # phi and weights updated

}

# M-step: re-estimate each cluster's mean, covariance, and prior weight from
# the responsibilities produced by the E-step
Mstep <- function(matrix.dt){
  for(k in 1:K){

  }
}

# per-observation cluster-membership probabilities under the fitted model
predict_probabilities <- function(matrix.dt){

}

# hard cluster assignment: the cluster with the highest predicted probability
predict <- function(matrix.dt){

}
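# Minimal sketch of what the E-step could compute (a simplifying assumption,
# not part of the assignment code above): with diagonal covariances each
# cluster's density factorizes over features, so per-cluster log-densities can
# be summed column by column and normalized in log space for stability.
# X: n x d matrix, means/vars: K x d matrices, weights: length-K vector.
estep_responsibilities_diag <- function(X, means, vars, weights){
  K <- nrow(means)
  log.dens <- sapply(1:K, function(k){
    centered <- sweep(X, 2, means[k, ], "-")
    log(weights[k]) -
      0.5 * sum(log(2 * pi * vars[k, ])) -
      rowSums(sweep(centered^2, 2, 2 * vars[k, ], "/"))
  })
  # log-sum-exp over clusters, one normalizing constant per observation
  log.norm <- apply(log.dens, 1, function(r) max(r) + log(sum(exp(r - max(r)))))
  exp(log.dens - log.norm)  # n x K matrix of responsibilities; rows sum to 1
}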