# ======================================================================
#
# Script: Simulate a breeding program that selects on true genetic values for the mean performance across a TPE
# Each generation, we undertake different strategies for crossing genotypes to create products for release
# and compare their performance in terms of the TPE mean and stability across the TPE
#
# Authors: D.L. Waters and D.J. Tolhurst
#
# ======================================================================

# ---- Load necessary libraries ---- #
rm(list = ls())  # Clean up the environment

library(AlphaSimR)
library(FieldSimR)
library(data.table)
library(dplyr)
library(ggplot2)
source("0dCross_fn.R")

#============================================# Prepare simulation parameters

# Load global parameters
source("2GlobalParams.R")

# Prepare lists to collect results
rmsd_percentiles <- c(0.1, 1, 2.5, 5, 10, 20, 50, 100)   # Stability percentiles used to assess performance 

# Set directory for simulation output
output_dir <- "10SimOutput"
if (!dir.exists(output_dir)) dir.create(output_dir)

# List scenario(s) to be simulated
sim_scen <- c("mod50_het") 


#=====================================================#
#Begin Simulation
#=====================================================#

for (ge_scen in sim_scen){

   cat(" Ge scenario:", ge_scen, "\n")
  
  
  # Create dataframes to record stats
  release_full <- data.frame()                        # Record the best genotypes per strategy per stability percentile
  release_top_full <- data.frame()                    # Record the average of the top 5 genotypes per strategy per stability percentile
  release_full_var <- data.frame()                    # Record  the best genotypes per strategy per stability percentile based on raw variance
  winners_full <- data.frame()                        # Record which strategy had the best genotype per stability percentile
  winners_full_var <- data.frame()                    # Record which strategy had the best genotype per stability percentile based on raw variance 
  pop_stats <- data.frame()                           # Store summaries of the population development populations
  cor_stats <- data.frame()                           # Store average genetic correlation among progeny in PD population
  pd_stats <- data.frame()                            # Store summaries in the PD population 
  
  
  # Load GEI matrix and define relevant variables 
  Ge <- readRDS(paste0("9simGe/",ge_scen,"_GEI.RDS"))
  Ce <- cov2cor(Ge)                                   # Correlation matrix
  De <- diag(diag(Ge))                                # Genetic variances within each environment

  k <- ge_params[[ge_scen]]$k                         # Rank of the GEI matrix
  k_maxfit <- ge_params[[ge_scen]]$k                  # Number of covariates used in crossing calculations
  p <- ge_params[[ge_scen]]$p                         # Number of environments sampled 
  nEnvTPE <- ge_params[[ge_scen]]$nEnvTPE             # Number of environments in the TPE (this must equal p for this simulation)
  
 
  for (Rep in 1:nReps){
   
  cat("Starting Rep", Rep, "\n")
    
  #---- Initialise Sim ----
  
  source("3CreateFounders.R")
  
    cat(" Founder simulation complete \n")
  
  #----Burn-in Phase ----
  
  for (cycle in 1:nCyclesBurnIn){
    
     cat("Working on burn-in cycle:", cycle, "\n")
     source("4BurnIn.R")                            # Run burn-in cycles
    } 
     burnDH <- DH
     
     cat("Burn-in complete \n")
     
     # ---- Future phase ----
    
      for(cycle in (nCyclesBurnIn+1):(nCyclesBurnIn+nCyclesFuture)) {
       
        cat("Working on future cycle:", cycle, "\n")
     
      # Get rotation for the breeding population in each cycle
        source("5RotatePopulation.R")
        
        
      # Make product development crosses using different strategies on the same population,
         for(strat in ProdDevStrat){
         cat("  prodDev: ", strat, ",", " future cycle ", cycle, "\n", "Rep ", Rep, 
             ", Ge scenario: ",ge_scen, "\n", sep = "")
          source(paste0("6ProdDev/6ProdDev_", strat, ".R"))
         }

        
      # Record the performance of product development lines with reference to the parental population
         source("7RecordPerformance.R")

        
      # Perform a cycle of population improvement 
        source("8PopImp.R")  
        
      }
     
     cat("Rep", Rep, "complete \n")
  }
  
  # output the results from each ge_scen
  
  fwrite(release_top_full, file = file.path(output_dir, paste0(ge_scen, "_release_top5_sum.csv")))
  fwrite(cor_stats, file = file.path(output_dir, paste0(ge_scen, "_cor_stats.csv")))
  fwrite(pop_stats, file = file.path(output_dir, paste0(ge_scen, "_pop_stats.csv")))
  fwrite(pd_stats, file = file.path(output_dir, paste0(ge_scen, "_pd_stats.csv")))
   }


#=====================================================#
# Plot outputs
#=====================================================#

# Average performance of the top 5 products over stability percentiles 
top5_df <- read.csv("10SimOutput/mod50_het_release_top5_sum.csv")
head(top5_df)
ggplot(subset(top5_df, rmsd_percentile != 0.1), aes(x = rmsd_percentile,
           y = main_effect, group = strat, color = strat)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun = mean, geom = "point") +
  theme_bw() + labs(x = "Stability percentile (%)", y = "Average genetic value", color = "Strategy")


# Number of candidate products per stability percentile 
prod_df <- read.csv("10SimOutput/mod50_het_pd_stats.csv")
head(prod_df)
ggplot(prod_df, aes(x = rmsd_percentile * 100, y = n_selected, group = strat, color = strat)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun = mean, geom = "point") +
  theme_bw() + labs(x = "Stability percentile (%)", y = "Number of candidates", color = "Strategy")


# Average performance for each strategy relative to selection on mean (here, consider all candidates) 
diff_df <- subset(prod_df, rmsd_percentile == 1.0) %>% group_by(cycle, Rep) %>%
  mutate(rel_main_effect = mean_main_effect - mean_main_effect[strat == "mean"])
head(diff_df)
ggplot(diff_df, aes(x = cycle, y = rel_main_effect, group = strat, color = strat)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun = mean, geom = "point") +
  theme_bw() +
  labs(y = "Average performance relative to selection on mean", color = "Strategy")


# Mean genetic correlation between environments in products over cycles
cor_df <- read.csv("10SimOutput/mod50_het_cor_stats.csv")
cor_df$strat[cor_df$strat == "dcross"] <- "dcross1.0"
cor_df$strat[cor_df$strat == "dcross25"] <- "dcross0.25"
cor_df$strat[cor_df$strat == "index"] <- "index1.0"
cor_df$strat[cor_df$strat == "index25"] <- "index0.25"
head(cor_df)
ggplot(subset(cor_df, strat != "pi"), aes(x = cycle, y = mean_cor, group = strat, color = strat)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun = mean, geom = "point") +
  theme_bw() + labs(x = "Future breeding cycle", y = "Genetic correlation between environments")


# Population improvement for each rep
pop_df <- read.csv("10SimOutput/mod50_het_pop_stats.csv")
head(pop_df)
ggplot(pop_df,aes(x = cycle, y = main_effect, group = Rep)) + geom_line(color = "grey50") +
  theme_bw() + theme(legend.position = "none")  + labs(x = "Future breeding cycle", y = "Average genetic value")


#=====================================================#
# End of script
#=====================================================#

