# This function was written by James B. Dorey starting on the 14th of March 2023 to wrap ChaoSpecies
# and output table for multiple species/countries/variables at once


#' Parallel estimation of species richness in a community
#' 
#' ChaoSpecies: Estimation of species richness in a single community based on five types of data: 
#' Type (1) abundance data (datatype="abundance"), Type (1A) abundance-frequency counts 
#' (datatype="abundance_freq_count"), Type (2) incidence-frequency data 
#' (datatype = "incidence_freq"), Type (2A) incidence-frequency counts 
#' (datatype="incidence_freq_count"), and Type (2B) incidence-raw data (datatype="incidence_raw"); 
#' see SpadeR-package details for data input formats.
#'
#' @param data A data frame or tibble. A data frame containing "abundance"-type data per variable
#' (population, country, species...) in columns.
#' @param datatype Character. The type of input data, "abundance", "abundance_freq_count", 
#' "incidence_freq", "incidence_freq_count" or "incidence_raw". So far only tested with "abundance"
#' data. Default = "abundance".
#' @param k Numeric. the cut-off point (default = 10), which separates species into "abundant" and "rare" 
#' groups for abundance data for the estimator ACE; it separates species into "frequent" and 
#' "infrequent" groups for incidence data for the estimator ICE.
#' Default = 10.
#' @param conf Numeric. A positive number ≤ 1 specifying the level of confidence interval.
#' Default = 0.95.
#' @param mc.cores Numeric. If > 1, the function will run in parallel
#' using mclapply using the number of cores specified. If = 1 then it will be run using a serial
#' loop. NOTE: Windows machines must use a value of 1 (see ?parallel::mclapply). Additionally,
#' be aware that each thread can use large chunks of memory.
#' Default = 1.
#'
#' @return Returns a list containing two tibbles. The first is a tibble that concatenates the outputs
#' from the basic data and rare species information in columns per input variable (column). The second is a tibble
#' that concatenates the various species richness estimates, with input variables in chunks of rows.
#' Additionally a console output will list the variables (columns) that lacked sufficient data
#' to be analysed.
#' 
#' 
#' @importFrom dplyr %>%
#' 
#' @seealso [SpadeR::ChaoSpecies()] 
#' 
#' @export
#'
#' @examples
#'   # Read in the example test data (Please don't use this for a final version or share too widely)
#'   testData <- readr::read_csv("TestData_JBD.csv")
#' 
#' countryChao_n1 <- ChaoWrapper(data = testData,
#' k = 5,
#' mc.cores = 4)


ChaoWrapper <- function(
    data = NULL,
    datatype = "abundance",
    k = 10,
    conf = 0.95,
    mc.cores = 1){
  # Runs SpadeR::ChaoSpecies once per column of `data` (optionally in parallel via
  # parallel::mclapply) and combines the per-column results into two tibbles:
  #   - basicTable: the basic data information, one value column per analysed variable
  #   - diversityTable: the species richness estimates, row-bound across variables
  # Columns that do not pass the data-sufficiency pre-check are skipped and reported
  # to the console.
  
    #### 0.0 Prep ####
      ##### 0.1 Input checks ####
  # Fail early with a readable message rather than an obscure error further down
  if(!is.data.frame(data)){
    stop(" - Please provide a data frame or tibble to 'data'.", call. = FALSE)
  }
  
      ##### 0.2 Functions ####
    # Wrap the ChaoSpecies function so that it can be passed to mclapply.
    # inputData_i is a one-element named list: the name is the variable (column) name and
    # the element is that column's vector of counts.
    # Returns NULL for variables that are skipped, otherwise a list with two tibbles
    # ("basicTable" and "diversityTable").
  wrapper <- function(inputData_i = NULL){
    counts <- inputData_i[[1]]
    varName <- names(inputData_i)
    
    # ONLY run variables that won't throw an error due to weird/poor cases of data:
    #  (1) there must be at least one positive count below the threshold, and
    #  (2) the data must not consist solely of singletons (ChaoSpecies stops in that case).
    # na.rm = TRUE keeps the condition a plain TRUE/FALSE when the column contains NA.
    # NOTE(review): the threshold is hard-coded to 5 although the original comment refers
    # to k - confirm which is intended before changing it.
    hasRareCounts <- any(counts > 0 & counts < 5, na.rm = TRUE)
    onlySingletons <- sum(counts == 1, na.rm = TRUE) == sum(counts, na.rm = TRUE)
    if(!hasRareCounts || onlySingletons){
      return(NULL)
    }
    
    # Run the ChaoSpecies function on the non-missing counts
    spOutput <- SpadeR::ChaoSpecies(
      data = counts[!is.na(counts)],
      datatype = datatype,
      k = k, 
      conf = conf)
    
      ###### a. diversityOutput ####
    # Get the diversity measures and re-format. The estimate columns are taken as
    # returned (rather than by hard-coded "95%Lower"/"95%Upper" names) so that the
    # confidence bounds are retained whatever value of conf is used.
    speciesTable <- as.data.frame(spOutput$Species_table)
    diversityTable <- dplyr::bind_cols(
      dplyr::tibble(variable = varName,
                    Name = rownames(speciesTable)),
      dplyr::as_tibble(speciesTable))
    
      ###### b. basicOutput ####
    # Get the basic information into one table, removing hanging white spaces at the
    # start of the rownames
    basicInfo <- as.data.frame(spOutput$Basic_data_information)
    basicTable <- dplyr::bind_cols(
      dplyr::tibble(rowname = gsub(pattern = "^\\s+", replacement = "",
                                   x = rownames(basicInfo))),
      dplyr::as_tibble(basicInfo))
    # rename Value to the variable (column) name
    names(basicTable) <- c("rowname", "variable", varName)
    
    # Return the tables as a list
    list(basicTable = basicTable, diversityTable = diversityTable)
  }
  
  #### 1.0 Data prep ####
    # Turn the data into a two-level list (one named one-element list per column) so that
    # each call to wrapper still receives the column name along with the counts
  varNames <- colnames(data)
  df_list <- lapply(varNames, function(nm){
    stats::setNames(list(data[[nm]]), nm)
  })
  names(df_list) <- varNames
  
  #### 2.0 Run functions ####    
    # Run the function per species or variable
  chaoResults <- parallel::mclapply(
    X = df_list,
    FUN = wrapper,
    mc.cores = mc.cores
  ) 
  
  #### 3.0 Process outputs ####
    ##### 3.1 Separate F + P ####
    # Find the non-null variables
  succeeded <- !vapply(chaoResults, is.null, logical(1))
    # Save the names of the variables that could not be analysed
  failures <- names(chaoResults)[!succeeded]
    # Remove the failed variables
  chaoResults <- chaoResults[succeeded]
  
    ##### 3.2 Basic outputs ####
  if(length(chaoResults) > 0){
      # Take the two descriptor columns from the first SUCCESSFUL variable; the first input
      # column may be one of the failures
    basicCols <- chaoResults[[1]]$basicTable[, c("rowname", "variable")]
      # Keep only the value column of each variable and bind them to the descriptor columns
    valueCols <- lapply(chaoResults, function(x){
      dplyr::select(x[["basicTable"]], !c("rowname", "variable"))
    })
    basicOut <- dplyr::bind_cols(basicCols, valueCols)
  }else{
      # Nothing could be analysed; return an empty table instead of failing
    basicOut <- dplyr::tibble()
  }
  
    ##### 3.3 Diversity outputs ####
    # Row bind the diversity statistics into a single table (empty if nothing was analysed)
  diversityOut <- dplyr::bind_rows(
    lapply(chaoResults, function(x) x[["diversityTable"]]))
  
    ##### 3.4 Combine ####
  output <- list(basicTable = basicOut, diversityTable = diversityOut)
  
  #### 4.0 User output ####
    # provide some user output on the failures
  if(length(failures) > 0){
    writeLines(paste0(
      " - We could not examine the following variables (because of insufficent data or sample size): ",
      paste(failures, collapse = ", ")
    ))}
    # provide user output about the file structure
  writeLines(paste0(" - Outputs can be found in a list with two tibbles called 'basicTable' and",
                    " 'diversityTable'."))
  
    # Return the output
  return(output)
  
} # END ChaoWrapper JBD
