source("RotatePopulation.R")
# Make product development crosses using different strategies on the same population,
for(strat in ProdDevStrat){
cat("  Working on prodDev:",strat, "cycle",cycle,"\n", "Rep", Rep,
" Ge scenario:",ge_scen,"\n")
source(paste0("pdStrats/",strat, "ProdDev.R"))
}
# Record the best product development lines with reference to the parental population
source("RotPiPd.R")
source("RecordGain.R")
# Perform a cycle of population improvement
source("PopImp.R")
for (p in rmsd_percentiles) {
cat("\nPercentile:", p, "\n")
print(summary(as.factor(winners_full$strat[winners_full$rmsd_percentile == p])))
}
}
}
# output the results from each ge_scen
fwrite(release_top_full, file = file.path(output_dir, paste0(ge_scen, "_release_top5_sum.csv")))
fwrite(cor_stats, file = file.path(output_dir, paste0(ge_scen, "_cor_stats.csv")))
fwrite(pop_stats, file = file.path(output_dir, paste0(ge_scen, "_pop_stats.csv")))
fwrite(pd_stats, file = file.path(output_dir, paste0(ge_scen, "_pd_stats.csv")))
}
make_top_mean_row <- function(subdf, top = 5) {
if (nrow(subdf) == 0) {
# Return a 1-row NA frame with same columns as gv_df (except 'strat' gets set later)
na_row <- as.data.frame(lapply(gv_df[0, ], function(x) NA))
na_row$n_top <- 0L
return(na_row[1, , drop = FALSE])
}
# Order by main_effect desc and take top-N (or fewer if not enough)
ord <- order(subdf$main_effect, decreasing = TRUE)
topn <- subdf[ord[seq_len(min(length(ord), top))], , drop = FALSE]
# Average numeric columns; keep non-numeric handled later
num_cols <- vapply(topn, is.numeric, logical(1))
mean_vec <- colMeans(topn[, num_cols, drop = FALSE], na.rm = TRUE)
out <- as.data.frame(t(mean_vec), stringsAsFactors = FALSE)
# Ensure all original columns are present (fill non-numeric with NA for now)
missing_cols <- setdiff(names(gv_df), names(out))
for (mc in missing_cols) out[[mc]] <- NA
# Reorder columns to match gv_df
out <- out[, names(gv_df), drop = FALSE]
# Add count actually used
out$n_top <- nrow(topn)
out
}
# Build release_top (averaged “top-N”)
results_top <- list()
for (i in seq_along(rmsd_windows)) {
r <- rmsd_windows[i]
percentile_label <- paste0(rmsd_percentiles[i])
filtered <- gv_df[gv_df$rot_rmsd < r, , drop = FALSE]
top_means_by_strat <- do.call(rbind, lapply(strats, function(s) {
subdf <- filtered[filtered$strat == s, , drop = FALSE]
row_mean <- make_top_mean_row(subdf, top = 5)  # <--- set your top value here
row_mean$strat <- s
row_mean$rmsd_percentile <- percentile_label
row_mean$rmsd_threshold  <- r
row_mean
}))
results_top[[percentile_label]] <- top_means_by_strat
}
# Define your rmsd_windows and list of all strategies
strats <- unique(gv_df$strat)
# Calculate thresholds based on parent population RMSD
rmsd_windows <- quantile(pop_df$rot_rmsd, probs = rmsd_percentiles / 100, na.rm = TRUE)
make_top_mean_row <- function(subdf, top = 5) {
if (nrow(subdf) == 0) {
# Return a 1-row NA frame with same columns as gv_df (except 'strat' gets set later)
na_row <- as.data.frame(lapply(gv_df[0, ], function(x) NA))
na_row$n_top <- 0L
return(na_row[1, , drop = FALSE])
}
# Order by main_effect desc and take top-N (or fewer if not enough)
ord <- order(subdf$main_effect, decreasing = TRUE)
topn <- subdf[ord[seq_len(min(length(ord), top))], , drop = FALSE]
# Average numeric columns; keep non-numeric handled later
num_cols <- vapply(topn, is.numeric, logical(1))
mean_vec <- colMeans(topn[, num_cols, drop = FALSE], na.rm = TRUE)
out <- as.data.frame(t(mean_vec), stringsAsFactors = FALSE)
# Ensure all original columns are present (fill non-numeric with NA for now)
missing_cols <- setdiff(names(gv_df), names(out))
for (mc in missing_cols) out[[mc]] <- NA
# Reorder columns to match gv_df
out <- out[, names(gv_df), drop = FALSE]
# Add count actually used
out$n_top <- nrow(topn)
out
}
# Build release_top (averaged “top-N”)
results_top <- list()
for (i in seq_along(rmsd_windows)) {
r <- rmsd_windows[i]
percentile_label <- paste0(rmsd_percentiles[i])
filtered <- gv_df[gv_df$rot_rmsd < r, , drop = FALSE]
top_means_by_strat <- do.call(rbind, lapply(strats, function(s) {
subdf <- filtered[filtered$strat == s, , drop = FALSE]
row_mean <- make_top_mean_row(subdf, top = 5)  # <--- set your top value here
row_mean$strat <- s
row_mean$rmsd_percentile <- percentile_label
row_mean$rmsd_threshold  <- r
row_mean
}))
results_top[[percentile_label]] <- top_means_by_strat
}
release_top <- do.call(rbind, results_top)
rownames(release_top) <- NULL
# Put annotation columns at the end
ann_cols <- c("n_top", "rmsd_percentile", "rmsd_threshold")
release_top <- release_top[, c(setdiff(names(release_top), ann_cols), ann_cols)]
# release based on top 5
release_top$cycle <- cycle
release_top$Rep <- Rep
# Append to the global release collector
release_top_full <- rbind(release_top_full, release_top)
df_list <- list(mean = df_mean,
dcross = df_dcross, dcross25 = df_dcross25,
index = df_index, index25 = df_index25,
pi = df_pi)
cor_means <- data.frame(strat = character(),
mean_cor = numeric(),
cycle = numeric(),
Rep = numeric(),
stringsAsFactors = FALSE)
# Loop over each dataset
for (name in names(df_list)) {
mat <- as.matrix(df_list[[name]][, 1:k]) %*% t(covs_tpe)
cor_mat <- cov2cor(var(mat))
mean_cor <- mean(cor_mat[upper.tri(cor_mat)])
cor_means <- rbind(cor_means,
data.frame(strat = name,
mean_cor = mean_cor,
cycle = cycle,
Rep = Rep))
}
cor_stats <- rbind(cor_stats, cor_means)
pop_sum <- data.frame(
cycle = cycle,
Rep = Rep,
main_effect = mean(pop_df$main_effect),
main_effect_var  = var(pop_df$main_effect),
pca_op = mean(pop_df$main_effect),
var_raw = mean(pop_df$var_raw),
var_scale = mean(pop_df$var_scale),
pca_rmsd = mean(pop_df$pca_rmsd),
rot_rmsd = mean(pop_df$rot_rmsd),
rot_rmsd_var = var(pop_df$rot_rmsd),
pop_rot$overall
)
# Append to global candidate stats collector
pop_stats <- rbind(pop_stats, pop_sum)
# Convert named thresholds to a numeric-friendly data.frame
rmsd_table <- data.frame(
rmsd_percentile = as.numeric(sub("%", "", names(rmsd_windows))) / 100,
threshold = as.numeric(rmsd_windows),
stringsAsFactors = FALSE
)
# Loop over strategies
for (s in unique(gv_df$strat)) {
sub_df <- gv_df[gv_df$strat == s, ]
# Loop over rmsd thresholds
for (i in seq_len(nrow(rmsd_table))) {
p_val <- rmsd_table$rmsd_percentile[i]
threshold <- rmsd_table$threshold[i]
filtered <- sub_df[sub_df$rot_rmsd < threshold, ]
# If no individuals pass, return NA
if (nrow(filtered) == 0) {
n_sel <- 0
mean_main <- NA_real_
mean_rmsd <- NA_real_
mean_var <- NA_real_
} else {
n_sel <- nrow(filtered)
mean_main <- mean(filtered$main_effect, na.rm = TRUE)
mean_rmsd <- mean(filtered$rot_rmsd, na.rm = TRUE)
mean_var <- mean(filtered$var_std, na.rm = TRUE)
}
# Append to collector
pd_stats <- rbind(pd_stats, data.frame(
cycle = cycle,
Rep = Rep,
strat = s,
rmsd_percentile = p_val,
n_selected = n_sel,
mean_main_effect = mean_main,
mean_rot_rmsd = mean_rmsd,
mean_var_std = mean_var
))
}
}
# ======================================================================
#
# Script: Simulate a breeding program that selects on true genetic values for the mean performance across a TPE
# Each generation, we undertake different strategies for crossing genotypes to create products for release
# and compare their performance in terms of the TPE mean and stability across the TPE
#
# Authors: D.L. Waters and D.J. Tolhurst
#
# ======================================================================
# ---- Load necessary libraries ---- #
rm(list = ls())  # Clean up the environment
set.seed(12)
library(AlphaSimR)
library(FieldSimR)
library(data.table)
library(dplyr)
source(paste0("seltools/drotate.R"))
source(paste0("seltools/dCross.R"))
#============================================# Prepare simulation parameters
# Load global parameters
source("GlobalParam.R")
# Prepare lists to collect results
rmsd_percentiles <- c(0.1, 1, 2.5, 5, 10, 20, 50, 100)   # rmsd percentiles used to assess performance
# Set directory for simulation output
output_dir <- "simOutput"
if (!dir.exists(output_dir)) dir.create(output_dir)
# List scenario(s) to be simulated
sim_scen <- c("mod50_het")
#=====================================================#
#Begin Simulation
#=====================================================#
for (ge_scen in sim_scen){
cat(" Ge scenario:",ge_scen,"\n")
# Create dataframes to record stats
release_full <- data.frame()                        # Record the best genotypes per strategy per stability percentile
release_top_full <- data.frame()                    # Record the average of the top 5 genotypes per strategy per stability percentile
release_full_var <- data.frame()                    # Record  the best genotypes per strategy per stability percentile based on raw variance
winners_full <- data.frame()                        # Record which strategy had the best genotype per stability percentile
winners_full_var <- data.frame()                    # Record which strategy had the best genotype per stability percentile based on raw variance
pop_stats <- data.frame()                           # Store summaries of the population development populations
cor_stats <- data.frame()                           # Store average genetic correlation among progeny in PD population
pd_stats <- data.frame()                            # Store summaries in the PD population
# Load GEI matrix and define relevant variables
Ge <- readRDS(paste0("simGe/",ge_scen,"_GEI.RDS"))
Ce <- cov2cor(Ge)                                   # Correlation matrix
De <- diag(diag(Ge))                                # Genetic variances within each environment
k <- ge_params[[ge_scen]]$k                         # Rank of the GEI matrix
k_maxfit <- ge_params[[ge_scen]]$k                  # Number of covariates used in crossing calculations
p <- ge_params[[ge_scen]]$p                         # Number of environments sampled
nEnvTPE <- ge_params[[ge_scen]]$nEnvTPE             # Number of environments in the TPE (this must equal p for this simulation)
for (Rep in 1:nReps){
#---- Initialise Sim ----
source("CreateFounders.R")
#----Burn-in Phase ----
for (cycle in 1:nCyclesBurnIn){
source("BurnIn.R")                            # Run burn-in cycles
}
burnDH <- DH
#burnF1 <- F1
print("burn in complete")
# ---- Future phase ----
for(cycle in (nCyclesBurnIn+1):(nCyclesBurnIn+nCyclesFuture)) {
cat("Working on future cycle:",cycle,"\n")
# Get rotation for the population in each cycle
source("RotatePopulation.R")
# Make product development crosses using different strategies on the same population,
for(strat in ProdDevStrat){
cat("  Working on prodDev:",strat, "cycle",cycle,"\n", "Rep", Rep,
" Ge scenario:",ge_scen,"\n")
source(paste0("pdStrats/",strat, "ProdDev.R"))
}
# Record the best product development lines with reference to the parental population
source("RecordPerformance.R")
# Perform a cycle of population improvement
source("PopImp.R")
for (p in rmsd_percentiles) {
cat("\nPercentile:", p, "\n")
print(summary(as.factor(winners_full$strat[winners_full$rmsd_percentile == p])))
}
}
}
# output the results from each ge_scen
fwrite(release_top_full, file = file.path(output_dir, paste0(ge_scen, "_release_top5_sum.csv")))
fwrite(cor_stats, file = file.path(output_dir, paste0(ge_scen, "_cor_stats.csv")))
fwrite(pop_stats, file = file.path(output_dir, paste0(ge_scen, "_pop_stats.csv")))
fwrite(pd_stats, file = file.path(output_dir, paste0(ge_scen, "_pd_stats.csv")))
}
# ======================================================================
#
# Script: Simulate a breeding program that selects on true genetic values for the mean performance across a TPE
# Each generation, we undertake different strategies for crossing genotypes to create products for release
# and compare their performance in terms of the TPE mean and stability across the TPE
#
# Authors: D.L. Waters and D.J. Tolhurst
#
# ======================================================================
# ---- Load necessary libraries ---- #
rm(list = ls())  # Clean up the environment
set.seed(12)
library(AlphaSimR)
library(FieldSimR)
library(data.table)
library(dplyr)
source(paste0("seltools/drotate.R"))
source(paste0("seltools/dCross.R"))
#============================================# Prepare simulation parameters
# Load global parameters
source("GlobalParam.R")
# Prepare lists to collect results
rmsd_percentiles <- c(0.1, 1, 2.5, 5, 10, 20, 50, 100)   # rmsd percentiles used to assess performance
# Set directory for simulation output
output_dir <- "simOutput"
if (!dir.exists(output_dir)) dir.create(output_dir)
# List scenario(s) to be simulated
sim_scen <- c("mod50_het")
#=====================================================#
#Begin Simulation
#=====================================================#
for (ge_scen in sim_scen){
cat(" Ge scenario:",ge_scen,"\n")
# Create dataframes to record stats
release_full <- data.frame()                        # Record the best genotypes per strategy per stability percentile
release_top_full <- data.frame()                    # Record the average of the top 5 genotypes per strategy per stability percentile
release_full_var <- data.frame()                    # Record  the best genotypes per strategy per stability percentile based on raw variance
winners_full <- data.frame()                        # Record which strategy had the best genotype per stability percentile
winners_full_var <- data.frame()                    # Record which strategy had the best genotype per stability percentile based on raw variance
pop_stats <- data.frame()                           # Store summaries of the population development populations
cor_stats <- data.frame()                           # Store average genetic correlation among progeny in PD population
pd_stats <- data.frame()                            # Store summaries in the PD population
# Load GEI matrix and define relevant variables
Ge <- readRDS(paste0("simGe/",ge_scen,"_GEI.RDS"))
Ce <- cov2cor(Ge)                                   # Correlation matrix
De <- diag(diag(Ge))                                # Genetic variances within each environment
k <- ge_params[[ge_scen]]$k                         # Rank of the GEI matrix
k_maxfit <- ge_params[[ge_scen]]$k                  # Number of covariates used in crossing calculations
p <- ge_params[[ge_scen]]$p                         # Number of environments sampled
nEnvTPE <- ge_params[[ge_scen]]$nEnvTPE             # Number of environments in the TPE (this must equal p for this simulation)
for (Rep in 1:nReps){
#---- Initialise Sim ----
source("CreateFounders.R")
#----Burn-in Phase ----
for (cycle in 1:nCyclesBurnIn){
source("BurnIn.R")                            # Run burn-in cycles
}
burnDH <- DH
#burnF1 <- F1
print("burn in complete")
# ---- Future phase ----
for(cycle in (nCyclesBurnIn+1):(nCyclesBurnIn+nCyclesFuture)) {
cat("Working on future cycle:",cycle,"\n")
# Get rotation for the breeding population in each cycle
source("RotatePopulation.R")
# Make product development crosses using different strategies on the same population,
for(strat in ProdDevStrat){
cat("  Working on prodDev:",strat, "cycle",cycle,"\n", "Rep", Rep,
" Ge scenario:",ge_scen,"\n")
source(paste0("pdStrats/",strat, "ProdDev.R"))
}
# Record the performance of product development lines with reference to the parental population
source("RecordPerformance.R")
# Perform a cycle of population improvement
source("PopImp.R")
}
}
# output the results from each ge_scen
fwrite(release_top_full, file = file.path(output_dir, paste0(ge_scen, "_release_top5_sum.csv")))
fwrite(cor_stats, file = file.path(output_dir, paste0(ge_scen, "_cor_stats.csv")))
fwrite(pop_stats, file = file.path(output_dir, paste0(ge_scen, "_pop_stats.csv")))
fwrite(pd_stats, file = file.path(output_dir, paste0(ge_scen, "_pd_stats.csv")))
}
# ======================================================================
#
# Script: Simulate a breeding program that selects on true genetic values for the mean performance across a TPE
# Each generation, we undertake different strategies for crossing genotypes to create products for release
# and compare their performance in terms of the TPE mean and stability across the TPE
#
# Authors: D.L. Waters and D.J. Tolhurst
#
# ======================================================================
# ---- Load necessary libraries ---- #
rm(list = ls())  # Clean up the environment
library(AlphaSimR)
library(FieldSimR)
library(data.table)
library(dplyr)
source(paste0("seltools/drotate.R"))
source(paste0("seltools/dCross.R"))
#============================================# Prepare simulation parameters
# Load global parameters
source("GlobalParam.R")
# Prepare lists to collect results
rmsd_percentiles <- c(0.1, 1, 2.5, 5, 10, 20, 50, 100)   # rmsd percentiles used to assess performance
# Set directory for simulation output
output_dir <- "simOutput"
if (!dir.exists(output_dir)) dir.create(output_dir)
# List scenario(s) to be simulated
sim_scen <- c("mod50_het")
#=====================================================#
#Begin Simulation
#=====================================================#
for (ge_scen in sim_scen){
cat(" Ge scenario:",ge_scen,"\n")
# Create dataframes to record stats
release_full <- data.frame()                        # Record the best genotypes per strategy per stability percentile
release_top_full <- data.frame()                    # Record the average of the top 5 genotypes per strategy per stability percentile
release_full_var <- data.frame()                    # Record  the best genotypes per strategy per stability percentile based on raw variance
winners_full <- data.frame()                        # Record which strategy had the best genotype per stability percentile
winners_full_var <- data.frame()                    # Record which strategy had the best genotype per stability percentile based on raw variance
pop_stats <- data.frame()                           # Store summaries of the population development populations
cor_stats <- data.frame()                           # Store average genetic correlation among progeny in PD population
pd_stats <- data.frame()                            # Store summaries in the PD population
# Load GEI matrix and define relevant variables
Ge <- readRDS(paste0("simGe/",ge_scen,"_GEI.RDS"))
Ce <- cov2cor(Ge)                                   # Correlation matrix
De <- diag(diag(Ge))                                # Genetic variances within each environment
k <- ge_params[[ge_scen]]$k                         # Rank of the GEI matrix
k_maxfit <- ge_params[[ge_scen]]$k                  # Number of covariates used in crossing calculations
p <- ge_params[[ge_scen]]$p                         # Number of environments sampled
nEnvTPE <- ge_params[[ge_scen]]$nEnvTPE             # Number of environments in the TPE (this must equal p for this simulation)
for (Rep in 1:nReps){
#---- Initialise Sim ----
source("CreateFounders.R")
#----Burn-in Phase ----
for (cycle in 1:nCyclesBurnIn){
source("BurnIn.R")                            # Run burn-in cycles
}
burnDH <- DH
#burnF1 <- F1
print("burn in complete")
# ---- Future phase ----
for(cycle in (nCyclesBurnIn+1):(nCyclesBurnIn+nCyclesFuture)) {
cat("Working on future cycle:",cycle,"\n")
# Get rotation for the breeding population in each cycle
source("RotatePopulation.R")
# Make product development crosses using different strategies on the same population,
for(strat in ProdDevStrat){
cat("  Working on prodDev:",strat, "cycle",cycle,"\n", "Rep", Rep,
" Ge scenario:",ge_scen,"\n")
source(paste0("pdStrats/",strat, "ProdDev.R"))
}
# Record the performance of product development lines with reference to the parental population
source("RecordPerformance.R")
# Perform a cycle of population improvement
source("PopImp.R")
}
}
# output the results from each ge_scen
fwrite(release_top_full, file = file.path(output_dir, paste0(ge_scen, "_release_top5_sum.csv")))
fwrite(cor_stats, file = file.path(output_dir, paste0(ge_scen, "_cor_stats.csv")))
fwrite(pop_stats, file = file.path(output_dir, paste0(ge_scen, "_pop_stats.csv")))
fwrite(pd_stats, file = file.path(output_dir, paste0(ge_scen, "_pd_stats.csv")))
}
setwd("~/LECTURER/research/roslin/Paper_supps/Simulation_V2/Simulation_V2")
# ---- Load necessary libraries ---- #
rm(list = ls())  # Clean up the environment
library(AlphaSimR)
library(FieldSimR)
library(data.table)
library(dplyr)
source(paste0("seltools/drotate.R"))
source(paste0("seltools/dCross.R"))
# Load global parameters
source("GlobalParam.R")
# Prepare lists to collect results
rmsd_percentiles <- c(0.1, 1, 2.5, 5, 10, 20, 50, 100)   # Stability percentiles used to assess performance
# Set directory for simulation output
output_dir <- "simOutput"
if (!dir.exists(output_dir)) dir.create(output_dir)
# List scenario(s) to be simulated
sim_scen <- c("mod50_het")
for (ge_scen in sim_scen){
cat(" Ge scenario:",ge_scen,"\n")
# Create dataframes to record stats
release_full <- data.frame()                        # Record the best genotypes per strategy per stability percentile
release_top_full <- data.frame()                    # Record the average of the top 5 genotypes per strategy per stability percentile
release_full_var <- data.frame()                    # Record  the best genotypes per strategy per stability percentile based on raw variance
winners_full <- data.frame()                        # Record which strategy had the best genotype per stability percentile
winners_full_var <- data.frame()                    # Record which strategy had the best genotype per stability percentile based on raw variance
pop_stats <- data.frame()                           # Store summaries of the population development populations
cor_stats <- data.frame()                           # Store average genetic correlation among progeny in PD population
pd_stats <- data.frame()                            # Store summaries in the PD population
# Load GEI matrix and define relevant variables
Ge <- readRDS(paste0("simGe/",ge_scen,"_GEI.RDS"))
Ce <- cov2cor(Ge)                                   # Correlation matrix
De <- diag(diag(Ge))                                # Genetic variances within each environment
k <- ge_params[[ge_scen]]$k                         # Rank of the GEI matrix
k_maxfit <- ge_params[[ge_scen]]$k                  # Number of covariates used in crossing calculations
p <- ge_params[[ge_scen]]$p                         # Number of environments sampled
nEnvTPE <- ge_params[[ge_scen]]$nEnvTPE             # Number of environments in the TPE (this must equal p for this simulation)
for (Rep in 1:nReps){
#---- Initialise Sim ----
source("CreateFounders.R")
#----Burn-in Phase ----
for (cycle in 1:nCyclesBurnIn){
source("BurnIn.R")                            # Run burn-in cycles
}
burnDH <- DH
print("burn in complete")
# ---- Future phase ----
for(cycle in (nCyclesBurnIn+1):(nCyclesBurnIn+nCyclesFuture)) {
cat("Working on future cycle:",cycle,"\n")
# Get rotation for the breeding population in each cycle
source("RotatePopulation.R")
# Make product development crosses using different strategies on the same population,
for(strat in ProdDevStrat){
cat("  Working on prodDev:",strat, "cycle",cycle,"\n", "Rep", Rep,
" Ge scenario:",ge_scen,"\n")
source(paste0("pdStrats/",strat, "ProdDev.R"))
}
# Record the performance of product development lines with reference to the parental population
source("RecordPerformance.R")
# Perform a cycle of population improvement
source("PopImp.R")
}
}
# output the results from each ge_scen
fwrite(release_top_full, file = file.path(output_dir, paste0(ge_scen, "_release_top5_sum.csv")))
fwrite(cor_stats, file = file.path(output_dir, paste0(ge_scen, "_cor_stats.csv")))
fwrite(pop_stats, file = file.path(output_dir, paste0(ge_scen, "_pop_stats.csv")))
fwrite(pd_stats, file = file.path(output_dir, paste0(ge_scen, "_pd_stats.csv")))
}
