######################################################################################################
# Code for manuscript: Chia et al. Enclosed bird nests driven by predation and thermoregulation
# 2024 November
# This script performs PCA on climatic variables and fits phylogenetic models

## INPUTS
# Species trait/environmental data with climatic PCA "data/all_traits_pca.csv"
# Consensus trees "data/consensus_tree_ultrametric.rds"
# Candidate phylogenetic trees "data/tree1k.rds"

## OUTPUTS
# Fitted models with consensus tree (main analysis) "output/model_*.rds"
# Fitted models and coefficients with 1,000 trees (supplementary analysis) "output/trees_model_*.rds" and "output/trees_coef_*.rds"
######################################################################################################

# required library
library(dplyr)
library(phytools)
library(phylolm)
library(parallel) # if using parallelization

# ---------------------------------------------
# 1 - Functions
#----------------------------------------------
# Generate dataset for model fitting
subset_data <- function(data, nest) {
  if (nest == "enclosed") {
    rs <- data %>% mutate(Y = Enclosed>0, N = Open>0 & Enclosed==0) %>% filter(Y+N > 0) %>%
      mutate_at(c("Migration","Egg","Ground","Cooperative","Clutch","npp","PC1","PC2"), ~(scale(.) %>% as.vector))
  } else if (nest == "dome") {
    rs <- data %>% mutate(Y = Dome>0, N = Open>0 & Enclosed==0) %>% filter(Y+N > 0) %>%
      mutate_at(c("Migration","Egg","Ground","Cooperative","Clutch","npp","PC1","PC2"), ~(scale(.) %>% as.vector))
  } else if (nest == "cavity") {
    rs <- data %>% mutate(Y = Cavity>0, N = Open>0 & Enclosed==0) %>% filter(Y+N > 0) %>%
      mutate_at(c("Migration","Egg","Ground","Cooperative","Clutch","npp","PC1","PC2"), ~(scale(.) %>% as.vector))
  }
  return(rs)
}

# Select initial beta values
find_best_ini_beta <- function(data, tree, n) {
  # Generate n sets of random initial beta values
  set.seed(123)
  beta <- matrix(runif(n*16, -.2, .2), ncol=16) # 16 betas per set
  
  # Build models using the initial beta values
  mlist <- list()
  for (i in 1:n) {
    tryCatch({
      print(paste("fitting model", i))
      mlist[[i]] <- phyloglm(formula, data=data, phy=tree, method="logistic_MPLE", full.matrix=T, btol=50, boot=0, start.beta=beta[i,])
    }, 
    error=function(e){cat("ERROR :",conditionMessage(e), "\n")},
    warning=function(w){cat("WARNING :",conditionMessage(w), "\n")})
  }
  
  # Select the set of beta values with the lowest AIC
  aic <- NULL
  for (i in 1:length(mlist)) {
    if (is.null(mlist[[i]])) {
      aic <- c(aic, NA)
    } else {
      aic <- c(aic, mlist[[i]]$aic)
    }
  }
  # beta.best <- beta[which(aic==min(aic, na.rm=T)), ]
  # return(beta.best)
  idx.best <- which(aic==min(aic, na.rm=T))
  beta.best <- beta[idx.best, ]
  return(list(beta.best, idx.best, aic, mlist))
}

# Fit model using given initial beta values
fit_model <- function(subset, nest, contree, n.random.beta, nboot) {
  # get data subset
  data <- subset_data(subset, nest)
  tree <- contree %>% drop.tip(.$tip.label[!.$tip.label %in% rownames(data)])
  dir.create(file.path("output"), showWarnings = FALSE)
  
  # find best fit initial beta values
  beta.ini <- find_best_ini_beta(data, tree, n.random.beta)
  # saveRDS(beta.ini, paste0("output/model_", sub("dt.", "", deparse(substitute(subset))), "_", nest, "_beta.rds"))
  
  # fit model using the nest initial betas and save results
  m <- phyloglm(formula, data=data, phy=tree, method="logistic_MPLE", start.beta=beta.ini[[1]], btol=20, boot=nboot, full.matrix=T)
  saveRDS(m, paste0("output/model_", sub("dt.", "", deparse(substitute(subset))), "_", nest, ".rds"))
  return(m)
}

# Fit model using multiple phylogenetic tree (supplementary analysis) with given initial beta values
fit_model_multi_trees <- function(subset, nest, trees, beta.ini, cores=NULL) {
  phyloreg <- function(i) {
    tryCatch({
      tree <- trees[[i]] %>% drop.tip(.$tip.label[!.$tip.label %in% rownames(data)])
      m <- phyloglm(formula, data=data, phy=tree, method="logistic_MPLE", start.beta=beta.ini, btol=20, boot=0)
      return(m)
    }, error=function(e) e)
  }
  
  # get data subset
  data <- subset_data(subset, nest)
  
  # fit models
  if (is.null(cores)) { # no parallelization
    m <- lapply(1:length(trees), phyloreg)
  } else {
    m <- mclapply(1:length(trees), phyloreg, mc.cores = cores) # (parallelized, available on Unix OS)
  }
  # saveRDS(m, paste0("output/multitrees_model_", sub("dt.", "", deparse(substitute(subset))), "_", nest, ".rds")) # optional
  
  # save coefficients
  coef <- NULL
  for (i in 1:length(trees)) coef <- rbind(coef, m[[i]]$coef)
  saveRDS(coef, paste0("output/multitrees_coef_", sub("dt.", "", deparse(substitute(subset))), "_", nest, ".rds"))
}
#---------------------------------------
# 3 - Fit models
#---------------------------------------
# Create three subsets
dt.all <- read.csv("data/all_traits_pca.csv", row.names = 1) # all apseices
dt.np <- dt.all %>% filter(Order != "Passeriformes") # non-passerines
dt.psr <- dt.all %>% filter(Order == "Passeriformes") # passerines

# Import phylogenetic consensus tree
contree <- readRDS("data/consensus_tree_ultrametric.rds")

# Model formula
formula <- formula("Y ~ npp + PC1 + PC2 + I(PC2^2) +
                   Ground + Cooperative + Migration + Egg + Clutch +
                   npp*Ground + npp*Cooperative + npp*Clutch +
                   PC1*Egg + PC2*Egg + I(PC2^2)*Egg")

#### Fit models and save results for each subset and model type (Main analysis)
# Set parameters
n.random.beta <- 300 # number of sets of random initial beta values to select from
nboot <- 999 # number of bootstrap in the final model fitting

# Fit models (parallelized version, available on Unix OS)
# Warning: long runtime (proportional to n.random.beta + nboot), 10-20hr for each line on a regular PC
tasks <- list(
  function() fit_model(dt.all, "enclosed", contree, n.random.beta, nboot),
  function() fit_model(dt.all, "dome",     contree, n.random.beta, nboot),
  function() fit_model(dt.all, "cavity",   contree, n.random.beta, nboot),
  function() fit_model(dt.np,  "enclosed", contree, n.random.beta, nboot),
  function() fit_model(dt.np,  "dome",     contree, n.random.beta, nboot),
  function() fit_model(dt.np,  "cavity",   contree, n.random.beta, nboot),
  function() fit_model(dt.psr, "enclosed", contree, n.random.beta, nboot),
  function() fit_model(dt.psr, "dome",     contree, n.random.beta, nboot),
  function() fit_model(dt.psr, "cavity",   contree, n.random.beta, nboot)
)
results <- mclapply(tasks, function(f) f(), mc.cores = 6) # specify number of cores used
results # Check results

# # Alternative: Fit model (without parallelization)
# m.all.enc <- fit_model(dt.all, "enclosed", contree, n.random.beta, nboot)
# m.all.dom <- fit_model(dt.all, "dome",     contree, n.random.beta, nboot)
# m.all.cav <- fit_model(dt.all, "cavity",   contree, n.random.beta, nboot)
# m.np.enc <-  fit_model(dt.np,  "enclosed", contree, n.random.beta, nboot)
# m.np.dom <-  fit_model(dt.np,  "dome",     contree, n.random.beta, nboot)
# m.np.cav <-  fit_model(dt.np,  "cavity",   contree, n.random.beta, nboot)
# m.psr.enc <- fit_model(dt.psr, "enclosed", contree, n.random.beta, nboot)
# m.psr.dom <- fit_model(dt.psr, "dome",     contree, n.random.beta, nboot)
# m.psr.cav <- fit_model(dt.psr, "cavity",   contree, n.random.beta, nboot)
# 
# # Check results
# summary(m.all.enc)
# summary(m.all.dom)
# summary(m.all.cav)
# summary(m.np.enc)
# summary(m.np.dom)
# summary(m.np.cav)
# summary(m.psr.enc)
# summary(m.psr.dom)
# summary(m.psr.cav)

#### Fit models using 1,000 candidate phylogenetic trees (Supplementary analysis)
# Import and select candidate phylogenetic trees
tree1k <- readRDS("data/tree1k.rds") # use Sample1kTree.R to generate file
ntree <- 1000
set.seed(123)
trees <- tree1k[sample(1000, ntree)]

# Fit model and save results
# Warning: long runtime (proportional to ntree) 8-15hr for each line with one core PC

ncores <- 5 # number of cores to run (For no parallelization, remove the attribute or set it to NULL)
fit_model_multi_trees(dt.all, "enclosed", trees, beta.ini = readRDS("output/model_all_enclosed_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.all, "dome",     trees, beta.ini = readRDS("output/model_all_dome_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.all, "cavity",   trees, beta.ini = readRDS("output/model_all_cavity_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.np,  "enclosed", trees, beta.ini = readRDS("output/model_np_enclosed_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.np,  "dome",     trees, beta.ini = readRDS("output/model_np_dome_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.np,  "cavity",   trees, beta.ini = readRDS("output/model_np_cavity_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.psr, "enclosed", trees, beta.ini = readRDS("output/model_psr_enclosed_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.psr, "dome",     trees, beta.ini = readRDS("output/model_psr_dome_beta.rds")[[1]], ncores)
fit_model_multi_trees(dt.psr, "cavity",   trees, beta.ini = readRDS("output/model_psr_cavity_beta.rds")[[1]], ncores)

