# ----------------------------------------------------------------------------------
# Title: Machine Learning Model Ensemble Fitting and Prediction Workflow for Biodiversity Analysis
# Author: Vinicius Marcilio-Silva
# Contact: viniciuschms@gmail.com
# Date: October 2024
# ----------------------------------------------------------------------------------
# Description: This script provides a generalized workflow for fitting models and
# predicting biodiversity metrics based on environmental predictors. Developed as
# supplemental material for the paper "Synergies and trade-offs between biodiversity conservation, human well-being, and
# agricultural production: lessons from the Atlantic Forest in Santa Catarina, Brazil"
# and based on code from Thiago Sanna F. Silva (tsfsilva@rc.unesp.br)
# available at https://datadryad.org/stash/dataset/doi:10.5061/dryad.6m905qfzp
# ----------------------------------------------------------------------------------
# Install necessary packages (only needs to be run once).
# NOTE: ggplot2 is required for the scatter plots below, and the caret methods
# used in the loop pull in randomForest ("rf"), kernlab ("svmRadial"),
# arm ("bayesglm") and RRF ("RRF") — all added here so the script is
# self-contained.
install.packages(c("caret", "tidyr", "ggplot2", "corrplot", "nnet", "caTools",
                   "caretEnsemble", "bartMachine", "mboost", "brnn",
                   "randomForest", "kernlab", "arm", "RRF"))
# Load required libraries. library() (rather than require()) errors
# immediately if a package is missing instead of returning FALSE and
# letting the script fail later with a confusing message.
library(caret)
library(tools)
library(tidyr)
library(ggplot2) # used for the exploratory scatter plots; was never loaded before
library(corrplot)
library(nnet)
library(caTools)
library(caretEnsemble)
library(bartMachine)
library(mboost)
library(brnn)
# Model fitting and prediction loop for each biodiversity metric.
# Assumes the globals 'data.df', 'future.data', 'response.names',
# 'predictor.names' and 'predictor.columns' are defined by the caller
# (see Notes at the end of the script).
for (i in seq_along(response.names)) {
  # Name of the biodiversity metric (response) handled in this iteration.
  # Using the explicit name everywhere (instead of positional column index i)
  # makes the loop robust to column reordering and keeps it consistent with
  # the model formula built below.
  response <- response.names[i]
  # Step 1: Preprocess predictors by scaling and centering.
  # FIX: the caret argument is 'method' (singular). The original 'methods ='
  # was silently swallowed by '...' and only worked because center/scale is
  # the default. Centering/scaling is idempotent, so re-applying it on later
  # iterations is harmless.
  # NOTE(review): preProcess is fit on the full data before the train/test
  # split, which leaks test-set statistics into scaling — consider fitting
  # on the training rows only.
  pre.pred <- preProcess(data.df[, predictor.names], method = c("scale", "center"))
  data.df[, predictor.names] <- predict(pre.pred, data.df[, predictor.names])
  # Step 2: Split data into training and testing sets (70/30 split)
  set.seed(1985) # For reproducibility
  train.indices <- createDataPartition(data.df[[response]], p = 0.7, list = FALSE)
  train.df <- data.df[train.indices, ]
  test.df <- data.df[-train.indices, ]
  # Step 3: Define the model formula dynamically from the response name and
  # the predictor names, e.g. "richness ~ elev + slope + ..."
  model.formula <- as.formula(paste(response, "~", paste(predictor.names, collapse = " + ")))
  # Step 4: Generate scatter plots of predictors vs. response for initial
  # exploration. pivot_longer() replaces the superseded gather(); column 1 of
  # the subset is the response, all remaining (predictor) columns are stacked.
  plot.df <- pivot_longer(data.df[, c(i, predictor.columns)],
                          cols = -1, names_to = "predictor", values_to = "value")
  # .data[[response]] is the tidy-eval idiom for referring to a column by a
  # string, replacing the fragile get() lookup.
  fig <- ggplot(plot.df, aes(value, .data[[response]])) +
    geom_point() +
    facet_wrap(~predictor)
  # FIX: paste0 avoids the spaces paste() inserted into the file name.
  ggsave(paste0("scatter_plot_", response, ".pdf"), plot = fig, device = "pdf")
  # Step 5: Set up control for cross-validation (10-fold with 3 repetitions);
  # savePredictions = "final" keeps the hold-out predictions needed for stacking.
  fit.control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                              savePredictions = "final")
  # Step 6: Define a list of candidate models for the ensemble.
  # 'importance = TRUE' enables variable-importance tracking where supported.
  tuneList <- list(
    rf = caretModelSpec(method = "rf", importance = TRUE),
    nnet = caretModelSpec(method = "nnet", linout = TRUE, importance = TRUE),
    svmRadial = caretModelSpec(method = "svmRadial", linout = TRUE, importance = TRUE),
    glm = caretModelSpec(method = "glm"),
    bayglm = caretModelSpec(method = "bayesglm"),
    glmboost = caretModelSpec(method = "glmboost"),
    bayrnn = caretModelSpec(method = "brnn"),
    RRF = caretModelSpec(method = "RRF", importance = TRUE)
  )
  # Step 7: Train all candidate models on the training split with shared
  # resampling folds (required for a valid stacked ensemble).
  models <- caretList(model.formula, data = train.df, trControl = fit.control,
                      tuneList = tuneList)
  # Step 8: Save per-model cross-validation summary statistics.
  results <- resamples(models)
  res <- as.data.frame(summary(results)[3])
  write.csv(res, paste0("model_summary_", response, ".csv"))
  # Step 9: Train stacked (GLM) ensemble model with the same cross-validation
  # scheme, optimizing RMSE.
  stackControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                               savePredictions = "final")
  stack.glm <- caretEnsemble(models, metric = "RMSE", trControl = stackControl)
  # Step 10: Predictions on the held-out test set, per model and for the ensemble.
  model_preds <- lapply(models, predict, newdata = test.df)
  model_preds2 <- data.frame(model_preds)
  model_preds2$ensemble <- predict(stack.glm, newdata = test.df)
  # Step 11: Evaluate and save test results (RMSE, Rsquared, MAE via
  # caret::defaultSummary on observed vs. ensemble-predicted values).
  eval.df.stack <- data.frame(obs = test.df[[response]], pred = model_preds2$ensemble)
  sum_RMSE_stack <- defaultSummary(eval.df.stack)
  write.csv(sum_RMSE_stack, paste0("evaluation_RMSE_", response, ".csv"))
  # Step 12: Variable importance from the ensemble model.
  stack.varimp <- varImp(stack.glm)
  write.csv(stack.varimp, paste0("variable_importance_", response, ".csv"))
  # Step 13: Predict future values based on new data (optional).
  # FIX: transform the future predictors with the preprocessing fitted on the
  # current data (pre.pred) rather than re-fitting preProcess on future.data —
  # re-fitting would put the future predictors on a different scale than the
  # one the models were trained with.
  future.data[, predictor.names] <- predict(pre.pred, future.data[, predictor.names])
  future.pred <- predict(stack.glm, newdata = future.data)
  write.csv(future.pred, paste0("future_predictions_", response, ".csv"))
}
# End of Script
# ----------------------------------------------------------------------------------
# Notes:
# - Replace 'data.df', 'future.data', 'response.names', 'predictor.names', and 'predictor.columns'
# with the actual names/indices of your dataset's components.
# - This code uses 'caretEnsemble' to train an ensemble of models for improved prediction accuracy.
# ----------------------------------------------------------------------------------