# ----------------------------------------------------------------------------------
# Title: Machine Learning Model Ensemble Fitting and Prediction Workflow for Biodiversity Analysis
# Author: Vinicius Marcilio-Silva
# Contact: viniciuschms@gmail.com
# Date: October 2024
# ----------------------------------------------------------------------------------
# Description: This script provides a generalized workflow for fitting models and
# predicting biodiversity metrics based on environmental predictors. Developed as
# supplemental material for the paper "Synergies and trade-offs between biodiversity conservation, human well-being, and
# agricultural production: lessons from the Atlantic Forest in Santa Catarina, Brazil"
# and based on code from Thiago Sanna F. Silva (tsfsilva@rc.unesp.br)
# available at https://datadryad.org/stash/dataset/doi:10.5061/dryad.6m905qfzp
# ----------------------------------------------------------------------------------
# Install necessary packages (only needs to be run once).
# NOTE: ggplot2 is required for the scatter plots below, and the caret methods
# used in the loop pull in randomForest ("rf"), kernlab ("svmRadial"),
# arm ("bayesglm") and RRF ("RRF") — all added here so the script is
# self-contained.
install.packages(c("caret", "tidyr", "ggplot2", "corrplot", "nnet", "caTools",
                   "caretEnsemble", "bartMachine", "mboost", "brnn",
                   "randomForest", "kernlab", "arm", "RRF"))
# Load required libraries. library() (rather than require()) errors
# immediately if a package is missing instead of returning FALSE and
# letting the script fail later with a confusing message.
library(caret)
library(tools)
library(tidyr)
library(ggplot2) # used for the exploratory scatter plots; was never loaded before
library(corrplot)
library(nnet)
library(caTools)
library(caretEnsemble)
library(bartMachine)
library(mboost)
library(brnn)
# Model fitting and prediction loop for each biodiversity metric.
# Assumes the globals 'data.df', 'future.data', 'response.names',
# 'predictor.names' and 'predictor.columns' are defined by the caller
# (see Notes at the end of the script).
for (i in seq_along(response.names)) {
  # Name of the biodiversity metric (response) handled in this iteration.
  # Using the explicit name everywhere (instead of positional column index i)
  # makes the loop robust to column reordering and keeps it consistent with
  # the model formula built below.
  response <- response.names[i]
  # Step 1: Preprocess predictors by scaling and centering.
  # FIX: the caret argument is 'method' (singular). The original 'methods ='
  # was silently swallowed by '...' and only worked because center/scale is
  # the default. Centering/scaling is idempotent, so re-applying it on later
  # iterations is harmless.
  # NOTE(review): preProcess is fit on the full data before the train/test
  # split, which leaks test-set statistics into scaling — consider fitting
  # on the training rows only.
  pre.pred <- preProcess(data.df[, predictor.names], method = c("scale", "center"))
  data.df[, predictor.names] <- predict(pre.pred, data.df[, predictor.names])
  # Step 2: Split data into training and testing sets (70/30 split)
  set.seed(1985) # For reproducibility
  train.indices <- createDataPartition(data.df[[response]], p = 0.7, list = FALSE)
  train.df <- data.df[train.indices, ]
  test.df <- data.df[-train.indices, ]
  # Step 3: Define the model formula dynamically from the response name and
  # the predictor names, e.g. "richness ~ elev + slope + ..."
  model.formula <- as.formula(paste(response, "~", paste(predictor.names, collapse = " + ")))
  # Step 4: Generate scatter plots of predictors vs. response for initial
  # exploration. pivot_longer() replaces the superseded gather(); column 1 of
  # the subset is the response, all remaining (predictor) columns are stacked.
  plot.df <- pivot_longer(data.df[, c(i, predictor.columns)],
                          cols = -1, names_to = "predictor", values_to = "value")
  # .data[[response]] is the tidy-eval idiom for referring to a column by a
  # string, replacing the fragile get() lookup.
  fig <- ggplot(plot.df, aes(value, .data[[response]])) +
    geom_point() +
    facet_wrap(~predictor)
  # FIX: paste0 avoids the spaces paste() inserted into the file name.
  ggsave(paste0("scatter_plot_", response, ".pdf"), plot = fig, device = "pdf")
  # Step 5: Set up control for cross-validation (10-fold with 3 repetitions);
  # savePredictions = "final" keeps the hold-out predictions needed for stacking.
  fit.control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                              savePredictions = "final")
  # Step 6: Define a list of candidate models for the ensemble.
  # 'importance = TRUE' enables variable-importance tracking where supported.
  tuneList <- list(
    rf = caretModelSpec(method = "rf", importance = TRUE),
    nnet = caretModelSpec(method = "nnet", linout = TRUE, importance = TRUE),
    svmRadial = caretModelSpec(method = "svmRadial", linout = TRUE, importance = TRUE),
    glm = caretModelSpec(method = "glm"),
    bayglm = caretModelSpec(method = "bayesglm"),
    glmboost = caretModelSpec(method = "glmboost"),
    bayrnn = caretModelSpec(method = "brnn"),
    RRF = caretModelSpec(method = "RRF", importance = TRUE)
  )
  # Step 7: Train all candidate models on the training split with shared
  # resampling folds (required for a valid stacked ensemble).
  models <- caretList(model.formula, data = train.df, trControl = fit.control,
                      tuneList = tuneList)
  # Step 8: Save per-model cross-validation summary statistics.
  results <- resamples(models)
  res <- as.data.frame(summary(results)[3])
  write.csv(res, paste0("model_summary_", response, ".csv"))
  # Step 9: Train stacked (GLM) ensemble model with the same cross-validation
  # scheme, optimizing RMSE.
  stackControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                               savePredictions = "final")
  stack.glm <- caretEnsemble(models, metric = "RMSE", trControl = stackControl)
  # Step 10: Predictions on the held-out test set, per model and for the ensemble.
  model_preds <- lapply(models, predict, newdata = test.df)
  model_preds2 <- data.frame(model_preds)
  model_preds2$ensemble <- predict(stack.glm, newdata = test.df)
  # Step 11: Evaluate and save test results (RMSE, Rsquared, MAE via
  # caret::defaultSummary on observed vs. ensemble-predicted values).
  eval.df.stack <- data.frame(obs = test.df[[response]], pred = model_preds2$ensemble)
  sum_RMSE_stack <- defaultSummary(eval.df.stack)
  write.csv(sum_RMSE_stack, paste0("evaluation_RMSE_", response, ".csv"))
  # Step 12: Variable importance from the ensemble model.
  stack.varimp <- varImp(stack.glm)
  write.csv(stack.varimp, paste0("variable_importance_", response, ".csv"))
  # Step 13: Predict future values based on new data (optional).
  # FIX: transform the future predictors with the preprocessing fitted on the
  # current data (pre.pred) rather than re-fitting preProcess on future.data —
  # re-fitting would put the future predictors on a different scale than the
  # one the models were trained with.
  future.data[, predictor.names] <- predict(pre.pred, future.data[, predictor.names])
  future.pred <- predict(stack.glm, newdata = future.data)
  write.csv(future.pred, paste0("future_predictions_", response, ".csv"))
}
# End of Script
# ----------------------------------------------------------------------------------
# Notes:
# - Replace 'data.df', 'future.data', 'response.names', 'predictor.names', and 'predictor.columns'
# with the actual names/indices of your dataset's components.
# - This code uses 'caretEnsemble' to train an ensemble of models for improved prediction accuracy.
# ----------------------------------------------------------------------------------