# data sourced from https://www.kaggle.com/code/jesucristo/1-house-prices-solution-top-1
import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder

# read only the columns relevant for predicting sale price;
# dropna() removes rows with missing values
df = pd.read_csv(
    'houseprice.csv',
    usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
             "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"],
).dropna()
# roughly 1200 viable properties remain after dropping, with 10 features/attributes

# replace YearBuilt with Total Years to use the house's age instead
df['Total Years'] = datetime.datetime.now().year - df['YearBuilt']
df.drop("YearBuilt", axis=1, inplace=True)

# all categorical features in df
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape"]
# desired output to predict
out_feature = "SalePrice"

# use LabelEncoder to map each categorical feature to integer indices
# 0 .. n_unique - 1, which the embedding layers below require
lbl_encoders = {}
for feature in cat_features:
    lbl_encoders[feature] = LabelEncoder()
    df[feature] = lbl_encoders[feature].fit_transform(df[feature])

# stack all categorical columns side by side (axis=1) and convert to a tensor
cat_values = np.stack([df[col].values for col in cat_features], axis=1)
cat_values = torch.tensor(cat_values, dtype=torch.int64)

# list all continuous features, excluding the categoricals and SalePrice (the target)
cont_features = [col for col in df.columns
                 if col not in cat_features + [out_feature]]

# stack all continuous columns into an array and convert to a tensor
cont_values = np.stack([df[col].values for col in cont_features], axis=1)
cont_values = torch.tensor(cont_values, dtype=torch.float)

# convert the SalePrice target to a tensor; reshape makes it a 2D column vector
y = torch.tensor(df['SalePrice'].values, dtype=torch.float).reshape(-1, 1)

# embedding layers are needed only for the categorical features;
# each feature's embedding input size is its number of unique categories
cat_dims = [len(df[col].unique()) for col in cat_features]
# rule of thumb for the embedding size: (n_categories, min(50, (n_categories + 1) // 2))
embedding_dim = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

# standalone embedding demonstration before the full network
# ModuleList stores one Embedding layer per categorical feature;
# Embedding converts the integer category codes into dense vectors for the nn
embed_representation = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
pd.set_option('display.max_rows', 500)

# embedding_val holds the result of applying each embedding layer to its
# categorical column (4 of them), so it contains 4 tensors
embedding_val = []
for i, e in enumerate(embed_representation):
    embedding_val.append(e(cat_values[:, i]))

# concatenate the list of tensors column-wise
z = torch.cat(embedding_val, 1)

# dropout randomly deactivates 40% of activations during each forward pass so the
# model doesn't rely too heavily on the same neurons, which helps prevent overfitting
dropout = nn.Dropout(.4)
final_embed = dropout(z)
# done with the standalone embedding demonstration
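# Optional sanity check (not part of the original walkthrough): the width of the
# concatenated embedding tensor should equal the sum of the per-feature embedding
# sizes, and its length should match the number of rows in df.
print(embedding_dim)                                   # e.g. [(15, 8), (5, 3), (2, 1), (4, 2)]
print(z.shape, sum(out for _, out in embedding_dim))   # z.shape[1] should equal the sum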
# implement the nn
# feed-forward network, so the flow of data is one-way
class FeedForwardNN(nn.Module):
    # args: embedding dimension tuples, # of continuous features, output size,
    # list with the # of neurons in each hidden layer, dropout probability
    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        # batch-normalize the continuous features
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerlist = []
        # total width of all embedding outputs, i.e. the sum of min(50, (x + 1) // 2)
        n_emb = sum(out for inp, out in embedding_dim)
        # network input = embedding outputs + # of continuous features
        n_in = n_emb + n_cont
        # i is the # of neurons in each hidden layer
        for i in layers:
            # linear transformation layer
            layerlist.append(nn.Linear(n_in, i))
            # ReLU activation for non-linearity, zeroing out negative values
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        # final layer with a single output, SalePrice
        layerlist.append(nn.Linear(layers[-1], out_sz))
        # the model flows through the layers in the order they were added
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        embeddings = []
        # apply the embeddings, then dropout
        for i, e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        # normalize the continuous features and concatenate everything
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

# set random seed for reproducibility
torch.manual_seed(100)
# create the model
model = FeedForwardNN(embedding_dim, len(cont_features), 1, [100, 50], p=0.4)

# training the nn; use mean-squared error as the loss function
loss_function = nn.MSELoss()
# define the optimizer with the model parameters and a learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# train/test split over the 1200 houses
batch_size = 1200
# hold out 15% of the data as the test set
test_size = int(batch_size * 0.15)
# split the categorical features, continuous features, and sale price by test size
train_categorical = cat_values[:batch_size - test_size]
test_categorical = cat_values[batch_size - test_size:batch_size]
train_cont = cont_values[:batch_size - test_size]
test_cont = cont_values[batch_size - test_size:batch_size]
y_train = y[:batch_size - test_size]
y_test = y[batch_size - test_size:batch_size]

# actual training: 5000 passes through the nn
epochs = 5000
final_losses = []
for i in range(1, epochs + 1):
    # forward pass on the training categorical and continuous features
    y_pred = model(train_categorical, train_cont)
    # use RMSE (square root of the MSE loss)
    loss = torch.sqrt(loss_function(y_pred, y_train))
    # track the computed loss; item() detaches it from the graph
    final_losses.append(loss.item())
    # print the loss every 10 epochs
    if i % 10 == 1:
        print("Epoch number: {} and the loss : {}".format(i, loss.item()))
    # reset the gradients
    optimizer.zero_grad()
    # back propagation
    loss.backward()
    # adjust the weights based on the calculated gradients
    optimizer.step()

# validating on the held-out test data
# no_grad() disables gradient calculations temporarily while validating
with torch.no_grad():
    y_pred = model(test_categorical, test_cont)
    loss = torch.sqrt(loss_function(y_pred, y_test))
print("Test RMSE: {}".format(loss.item()))
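# Optional extra metrics (not in the original script): mean absolute error and mean
# absolute percentage error over the same held-out predictions computed above.
mae = (y_pred - y_test).abs().mean().item()
mape = ((y_pred - y_test).abs() / y_test).mean().item() * 100
print("Test MAE: {:.2f}, Test MAPE: {:.2f}%".format(mae, mape))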
# test results: 180 held-out houses, per the split above
data_verify = pd.DataFrame(y_test.tolist(), columns=["Test"])
# prediction results
data_predicted = pd.DataFrame(y_pred.tolist(), columns=["Prediction"])
# concatenate into one table for viewing
final_output = pd.concat([data_verify, data_predicted], axis=1)
final_output['Difference'] = final_output['Test'] - final_output['Prediction']
# final output displaying the actual house price vs. the model's predicted house
# price based on the given categorical and continuous features
print(final_output)

# saving the model (whole model and weights only)
torch.save(model, 'HousePrice.pt')
torch.save(model.state_dict(), 'HouseWeights.pt')

# to load the model from the saved weights:
# embs_size=[(15, 8), (5, 3), (2, 1), (4, 2)]
# model1=FeedForwardNN(embs_size,5,1,[100,50],p=0.4)
# model1.load_state_dict(torch.load('HouseWeights.pt'))
# model1.eval()
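# Minimal inference sketch (assumes the commented-out loading code above has been
# run so that model1 exists; the first five test rows are used purely as an example):
# with torch.no_grad():
#     sample_pred = model1(test_categorical[:5], test_cont[:5])
# print(sample_pred)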