

## LIBRARY
library(optparse)
library(readxl)
library(ggplot2)

## DATA
## Input file locations. All paths are relative and are therefore resolved
## against the working directory the script is launched from.
## NOTE(review): in the visible part of Main() the fam, IID/group/phenotype and
## eigenvec inputs are read from the command-line options (--in-fam,
## --in-iid-group-phe-tsv, --in-eigenvec); FAM_FILE, IID_GROUP_PHE_TSV and
## EIGENVEC_FILE look like unused defaults -- confirm before relying on them.

# PLINK-style fam file (see NOTE above)
FAM_FILE <- "results/cnv_filtering/NORDiC_2021.analysisready.fam"
# analysis-ready CNV callset; read as chrom / start0 / end / locus / cnvtype / IID
CNV_BED <- "results/assoc_tests/NORDiC_2021.callset.analysisready.bed"
# IID / group / case-status table (see NOTE above)
IID_GROUP_PHE_TSV <- "results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv"
# per-sample intensity / raw CNV count metrics, merged into the sample table by IID
METRICS_TSV <- "results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.tsv"
# locus / cnvtype / IID / frequency table used to annotate each CNV call
LOCUS_CNVTYPE_IID_FREQ_TSV <- "results/annot_cnv/NORDiC_2021.callset.locus_cnvtype_iid_freq.tsv"
CNV_CDS_OVERLAPS_BED <- "results/assoc_tests/NORDiC_2021.callset.analysisready.CDS_gene_overlaps.bed"
# PLINK eigenvec file (see NOTE above)
EIGENVEC_FILE <- "results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA.eigenvec"
# two-column table: CNV locus (interval) and an overlapped gene
INTERVAL_GENEOVERLAPS_TSV <- "results/annot_cnv/NORDiC_2021.callset.interval_geneoverlaps.tsv"
# gnomAD per-gene LoF metrics; columns gene, pLI and oe_lof_upper_bin are used
GNOMAD_LOF_METRICS_TSV <- "results/global_cnv_burden_analysis/gnomad.v2.1.1.lof_metrics.by_gene.txt"
# CNV calls overlapping known neurodevelopmental CNV regions (column 10 holds
# the region label used for exclusions)
NDEV_DEL_BED <- "results/annot_cnv/NORDiC_2021.qual_cnv.del_neurodev.bed"
NDEV_DUP_BED <- "results/annot_cnv/NORDiC_2021.qual_cnv.dup_neurodev.bed"
# gene list used to add qualifying exonic deletions to the neurodev deletions
NDEV_EXONICDEL_GENELIST <- "data/annot_cnv/PMID_30994872_TableS11.exonic_del.gene.list"
# neurodevelopmental disorder (NDD) gene list
NDD_GENES <- "data/annot_cnv/PMID_35982160.NDD_genes_FDR05.txt"
# per-locus constraint tables (binary not-conserved/conserved split, and
# per-integer score bins), joined on the "locus" column
CONS_BINARY_TSV <- "results/annot_cnv/NORDiC_2021.qual_cnv.zoonomia.binary.tsv"
CONS_INT_TSV <- "results/annot_cnv/NORDiC_2021.qual_cnv.zoonomia.integer.tsv"
# gene-level pHaplo / pTriplo scores (sheet "Table S7")
HAPLOTRIPLO_SCORE_XLSX <- "data/annot_cnv/1-s2.0-S0092867422007887-mmc7.xlsx"
CNV_CALL_METRICS_TSV <- "results/annot_cnv/NORDiC_2021.qual_cnv.metrics.tsv"

Main <- function() {

  # get user args
  option_list <- list( 
      make_option("--no-angi", default=FALSE, action="store_true",
                  help="no ANGI samples in analysis [default %default]"),
      make_option("--size-ge-100kb", default=FALSE, action="store_true",
                  help="Only include CNVs that are at least 100kb in size [default %default]"),
      make_option("--in-iid-group-phe-tsv", default=NA, action="store",
                  type='character', 
                  help="name of input tsv with IID/group/phenotype [REQUIRED, default %default]"),
      make_option("--in-fam", default=NA, action="store", type="character",
          help="name of input fam file [REQUIRED, default %default]"),
      make_option("--in-eigenvec", default=NA, action="store", type="character",
          help="name of input PLINK eigenvec file [REQUIRED, default %default]"),
      make_option("--outroot", default=NA, action="store", type="character",
          help="root name for output files [REQUIRED, default %default]")
      )

  # get command line options, if help option encountered print help and exit,
  # otherwise if options not found on command line then set defaults, 
  opt_parser <- OptionParser(option_list=option_list)
  opt <- parse_args(opt_parser)
  if (is.na(opt[["in-iid-group-phe-tsv"]]) | is.na(opt[["in-fam"]]) | is.na(opt[["in-eigenvec"]]) | is.na(opt[["outroot"]])) {
    print_help(opt_parser)
    stop("missing at least one REQUIRED opt")
  }

  # read sample data
  df <- read.table(opt[["in-iid-group-phe-tsv"]], header=F, sep="\t", stringsAsFactors=F)
  colnames(df) <- c("IID","GROUP","CASE")
  df <- unique(df)

  # grab case and control IIDs
  case_iids <- subset(df, CASE==1)$IID
  ctrl_iids <- subset(df, CASE==0)$IID
  
  # merge in sex from fam file
  fam <- read.table(opt[["in-fam"]], header=F, stringsAsFactors=F)
  fam <- fam[,c(2,5)]
  colnames(fam) <- c("IID","FEMALE")
  fam$FEMALE <- fam$FEMALE - 1
  df <- merge(df, fam, by="IID")

  # merge in metrics
  metrics <- read.table(METRICS_TSV, header=T, sep="\t", stringsAsFactors=F)
  metrics <- unique(metrics)
  metrics$n_cnv <- NULL
  metrics$n_del <- NULL
  metrics$n_dup <- NULL
  print(summary(metrics$n_cnv_raw))
  df <- merge(df, metrics, by="IID")

  # merge in eigenvec file
  evec <- read.table(opt[["in-eigenvec"]], stringsAsFactors=F)
  colnames(evec) <- c("FID","IID",paste0("PC",1:20))
  evec$FID <- NULL
  df <- merge(df, evec, by="IID")
  

  # make IID rownames of df
  rownames(df) <- df$IID

  # read constraint files
  cons_01 <- read.table(CONS_BINARY_TSV, header=T, stringsAsFactors=F)
  # cols_convert_list <- list("X.20to2.27"="uncons", "X2.27to12"="cons")
  cons_01$notcons <- cons_01$X.20to2.27
  cons_01$cons <- cons_01$X2.27to12
  cons_int <- read.table(CONS_INT_TSV, header=T, stringsAsFactors=F)
  uncons_cols <- c("X.20to.19", "X.19to.18", "X.18to.17", "X.17to.16",
                   "X.16to.15", "X.15to.14", "X.14to.13", "X.13to.12", 
                   "X.12to.11", "X.11to.10", "X.10to.9", "X.9to.8",
                   "X.8to.7", "X.7to.6", "X.6to.5", "X.5to.4", "X.4to.3",
                   "X.3to.2", "X.2to.1", "X.1to0")
  cons <- merge(cons_01, 
                cons_int[,c("locus",
                            uncons_cols,
                            "X0to1","X1to2","X2to3",
                            "X3to4","X4to5","X5to6","X6to7","X7to8","X8to9")],
                by="locus")
  
  # which covars are assoc with case status?
  covars_test <- c(paste0("PC",1:20), "LRR_SD", "LRR_mean",
                   "FEMALE","n_del_raw","n_dup_raw")
  covars_test <- c(paste0("PC",1:20), "LRR_SD", "FEMALE")
  covar_caco_assoc <-data.frame(covar=character(), 
                                lg_est=numeric(),
                                lg_p=numeric())
  for (covar in covars_test) {
    print(covar)
    print(summary(df[[covar]]))
    mdl_str <- paste0("CASE~",covar)
    res <- summary(glm(as.formula(mdl_str), data=df, family=binomial))
    est <- res$coefficients[covar, 1]
    p <- res$coefficients[covar, 4]
    covar_caco_assoc <- rbind(covar_caco_assoc,
                              data.frame(covar=covar, lg_est=est, lg_p=p)
                             )
  }
  covar_caco_assoc_signif <- subset(covar_caco_assoc, lg_p < 0.01)$covar
  covar_caco_assoc_signif <- as.character(covar_caco_assoc_signif)
  
  print(covar_caco_assoc)
  cat("Covariates associated (p<0.01) with case status : \n")
  print(covar_caco_assoc_signif)

  # which covars are assoc with ncnv_raw?
  covars_test <- c(paste0("PC",1:20), "LRR_SD", "FEMALE")
  covar_ncnv_assoc <-data.frame(covar=character(), 
                                lm_est=numeric(),
                                lm_p=numeric())
  for (covar in covars_test) {
    mdl_str <- paste0("n_cnv_raw~",covar)
    res <- summary(glm(as.formula(mdl_str), data=df))
    est <- res$coefficients[covar, 1]
    p <- res$coefficients[covar, 4]
    covar_ncnv_assoc <- rbind(covar_ncnv_assoc,
                              data.frame(covar=covar, lm_est=est, lm_p=p)
                             )
  }
  print(covar_ncnv_assoc)
  covar_ncnv_assoc_signif <- subset(covar_ncnv_assoc, lm_p < 0.01)$covar
  covar_ncnv_assoc_signif <- as.character(covar_ncnv_assoc_signif)
  
  cat("Covariates associated (p<0.01) with number of raw cnv calls : \n")
  print(covar_ncnv_assoc_signif)
  assoc_covar <- intersect(covar_caco_assoc_signif,
                           covar_ncnv_assoc_signif)

  print("Covariates assoc with case status and raw cnv calls : \n")
  print(assoc_covar)
  cat("Adding in sex and PCs 1-5 to covariates if not already present.\n")
  assoc_covar <- unique(sort(c("FEMALE",paste0("PC",1:5),
                               assoc_covar)))
  print("Covariates assoc with case status and raw cnv calls (or major PCs) : \n")
  print(assoc_covar)

  # add broad dataset group classifications
  df$dataset_adj <- df$dataset
  df$dataset_adj <- ifelse(grepl("ANGI",df$dataset_adj),"ANGI",df$dataset_adj)
  df$dataset_adj <- ifelse(grepl("norway_ctrls",df$dataset_adj),"norway_ctrls",df$dataset_adj)
  df$dataset_adj <- ifelse(grepl("NORDiC-SWE",df$dataset_adj),"NORDiC-SWE",df$dataset_adj)
  df$dataset_adj <- ifelse(grepl("NORDiC-NOR",df$dataset_adj),"NORDiC-NOR",df$dataset_adj)
  df$dataset_adj <- ifelse(grepl("LG500_ctrls",df$dataset_adj),"LG500_ctrls",df$dataset_adj)
  
  # if desired, remove ANGI
  if (opt[["no-angi"]] == TRUE) {
    df <- subset(df, dataset_adj != "ANGI")
  }

  # read cnv callset, form del-only and dup-only subsets
  cnv <- read.table(CNV_BED, header=F, stringsAsFactors=F, sep="\t")
  colnames(cnv) <- c("chrom","start0","end","locus","cnvtype","IID")
  cnv$length <- cnv$end - cnv$start0

  # add in cnv frequency column
  cnv$locus_cnvtype_iid <- paste0(cnv$locus,"_",cnv$cnvtype,"_",cnv$IID)
  frq_df <- read.table(LOCUS_CNVTYPE_IID_FREQ_TSV, header=F, stringsAsFactors=F)
  colnames(frq_df)[4] <- "freq"
  frq_df$locus_cnvtype_iid <- paste0(frq_df[,1],"_",frq_df[,2],"_",frq_df[,3])
  cnv <- merge(cnv, frq_df[,c("locus_cnvtype_iid","freq")], by="locus_cnvtype_iid")
  cnv$locus_cnvtype_iid <- NULL

  # read gene cds overlaps per cnv locus
  interval_geneoverlaps <- read.table(INTERVAL_GENEOVERLAPS_TSV,
                                 header=F, stringsAsFactors=F, sep="\t")
  locus_ngenes <- table(interval_geneoverlaps[,1])

  # with --size-ge-100kb, only keep cnvs that are at least 100kb in length
  # (main analysis; cnvs <100kb are analysed separately); otherwise keep
  # cnvs that are at least 30kb in length (should already be a qualifying
  # criterion)
  if (opt[["size-ge-100kb"]] == TRUE) {
    cnv <- subset(cnv, length >= 100000)
  } else {
    cnv <- subset(cnv, length >= 30000)
  }

  # get CNVs that hit protein-coding bases of only 1 gene
  g1_loci <- names(locus_ngenes[locus_ngenes == 1])
  cnv_g1 <- subset(cnv, locus %in% g1_loci)

  # only keep cnv calls where carrier sample is in selected case/control 
  cnv <- subset(cnv, IID %in% df$IID)
  cnv_g1 <- subset(cnv_g1, IID %in% df$IID) 
  
  # store cnv calls
  del <- subset(cnv, cnvtype=="DEL")
  dup <- subset(cnv, cnvtype=="DUP")
  cnv_list <- list("cnv"=cnv,
                   "del"=del,
                   "dup"=dup)
  del_g1 <- subset(cnv_g1, cnvtype=="DEL")
  dup_g1 <- subset(cnv_g1, cnvtype=="DUP")
  cnv_g1_list <- list("cnv"=cnv_g1,
                      "del"=del_g1,
                      "dup"=dup_g1)

  # from raw and analysis ready cnv counts, get number of cnvs per sample that
  # ended up being filtered from analysis
  rownames(df) <- df$IID
  for (x in c("cnv","del","dup")) {
    col <- paste0("n_",x)
    colraw <- paste0(col,"_raw")
    col0 <- paste0(col,"_0")
    df <- CnvCount(df, cnv_list[[x]], col, count_type="n")
    df[[col0]] <- df[[colraw]] - df[[col]]
  }

  # before proceeding, plot analysis-ready cnv counts per dataset
  sumstats <- data.frame(metric=character(), 
                         group=character(), dataset=character(),
                         mean=numeric(), sd=numeric(),
                         lowerbound=numeric(), upperbound=numeric())
  datasets <- unique(sort(df$dataset))
  for (col in c("LRR_SD",
                "n_cnv_raw",
                "n_cnv_0",
                "n_cnv")) {
    for (ds in datasets) {
      df.d <- subset(df, dataset==ds)
      mean.x.ds <- mean(df.d[[col]])
      sd.x.ds <- sd(df.d[[col]])
      sumstats <- rbind(sumstats,
                        data.frame(metric=col, dataset=ds,
                                   group='GSA',
                                   mean=mean.x.ds,
                                   sd=sd.x.ds,
                                   lowerbound = mean.x.ds - sd.x.ds,
                                   upperbound = mean.x.ds + sd.x.ds)
                       )

    }
    sumstats.x <- subset(sumstats, metric==col)
    print(sumstats.x)
    print(dim(sumstats.x))
    print(length(datasets))
    sumstats.x <- sumstats.x[order(sumstats.x$mean), ]
    sumstats.x$dataset <- factor(sumstats.x$dataset,
                                 levels=sumstats.x$dataset)

    # plot mean / sd per dataset for metric
    gg <- ggplot(sumstats.x, aes(x=mean, y=dataset, color))
    gg <- gg + geom_pointrange(aes(xmin=lowerbound, xmax=upperbound))
    gg <- gg + ggtitle(col)
    gg <- gg + theme(axis.title.x = element_blank(),
                     axis.title.y = element_blank(),
                     axis.text.y = element_text(angle = 45, hjust = 1))
              ggsave(gg, file=paste0(opt[["outroot"]], ".dataset.",col,".pdf")) 
  }  

  # read gnomad pLI file, only keep subset with defined pLI
  gnomad_pli <- read.table(GNOMAD_LOF_METRICS_TSV,header=T, sep="\t",
                           stringsAsFactors=F)
  gnomad_pli <- gnomad_pli[,c("gene","pLI","oe_lof_upper_bin")]
  gnomad_pli <- subset(gnomad_pli, is.na(pLI) == F)

  # how many genes per pLI bin? print to stdout
  cat("N genes with pLI < 0.5 : ", nrow(subset(gnomad_pli, pLI < 0.5)),"\n")
  cat("N genes with pLI > 0.5 : ", nrow(subset(gnomad_pli, pLI > 0.5)),"\n")
  cat("N genes with pLI 0.5-0.995 : ", 
      nrow(subset(gnomad_pli, (pLI > 0.5) & (pLI < 0.995))),"\n")
  cat("N genes with pLI > 0.995 : ", nrow(subset(gnomad_pli, pLI > 0.995)),"\n")

  # get max pLI per interval
  interval_max_pli <- data.frame(interval=unique(sort(interval_geneoverlaps[,1])))
  rownames(interval_max_pli) <- interval_max_pli$interval
  interval_max_pli$max_pli <- rep(0, nrow(interval_max_pli))
  interval_max_pli$max_loeuf <- rep(-1, nrow(interval_max_pli))
  for (interval in rownames(interval_max_pli)) {
    interval_geneoverlaps.i <- subset(interval_geneoverlaps,
                                      interval_geneoverlaps[,1]==interval)
    genes.i <- interval_geneoverlaps.i[,2]
    gnomad_pli.i <- subset(gnomad_pli, gene %in% genes.i)
    if (nrow(gnomad_pli.i) > 0) {
      pli_max <- max(gnomad_pli.i$pLI)
      loeuf_max <- max(gnomad_pli.i$oe_lof_upper_bin)
      interval_max_pli[interval, "max_pli"] <- pli_max
      interval_max_pli[interval, "max_loeuf"] <- loeuf_max
    }
  }

  # read phaplo / ptriplo gene-based score file
  ht <- data.frame(read_excel(HAPLOTRIPLO_SCORE_XLSX, sheet="Table S7"))
  print(head(ht))
  haploinsuff_genes <- ht[ht$pHaplo >= 0.86, "Gene"]
  haplosuff_genes <- ht[ht$pHaplo < 0.86, "Gene"]
  triploinsuff_genes <- ht[ht$pTriplo >= 0.94, "Gene"]
  triplosuff_genes <- ht[ht$pTriplo < 0.94, "Gene"]
  haploinsuff_triploinsuff_genes <- intersect(haploinsuff_genes,
                                              triploinsuff_genes)
  haplosuff_triplosuff_genes <- intersect(haplosuff_genes,
                                          triplosuff_genes)

  # global cnv count
  for (x in c("n","nbp","n100kb")) {
    for (y in c("cnv","del","dup")) {
      xy <- paste0(x,"_",y)
      df <- CnvCount(df, cnv_list[[y]], xy, count_type=x)
    }
  }

  # by size bin
  for (x in c("n")) {
    for (y in c("cnv","del","dup")) {
      xy <- paste0(x,"_",y)
      df <- CnvCount(df, 
                     cnv_list[[y]], 
                     paste0(xy,"_30kb_to_100kb"), 
                     count_type=x,
                     minlength=30001,
                     maxlength=100000)
      df <- CnvCount(df, 
                     cnv_list[[y]], 
                     paste0(xy,"_100kb_to_500kb"), 
                     count_type=x,
                     minlength=100001,
                     maxlength=500000)
      df <- CnvCount(df, 
                     cnv_list[[y]], 
                     paste0(xy,"_500kb_to_1mb"), 
                     count_type=x,
                     minlength=500001,
                     maxlength=1000000)
      df <- CnvCount(df, 
                     cnv_list[[y]], 
                     paste0(xy,"_gt_1mb"), 
                     count_type=x,
                     minlength=1000001)

    }
  }

  # by frequency
  freq_lowerbounds <- c(0, 2,  11, 21)
  freq_upperbounds <- c(1, 10, 20, 38)
  freq_lowerbounds <- c(0, 2, 6,  11)
  freq_upperbounds <- c(1, 5, 10, 38)
  n_bounds <- length(freq_lowerbounds)
  for (i in 1:n_bounds) {
   for (x in c("n")) {
    for (y in c("cnv","del","dup")) {
      xy <- paste0(x,"_",y)
      df <- CnvCount(df, 
                     cnv_list[[y]], 
                     paste0(xy,"_freq",
                            freq_lowerbounds[i],
                            "to",
                            freq_upperbounds[i]), 
                     count_type=x,
                     minfreq=freq_lowerbounds[i],
                     maxfreq=freq_upperbounds[i])
    
    }
   }
  }
 
  # store counts of n gene overlaps per sample 
  print('get gene counts ')
  df <- NgenesPerSample(df, cnv_list, interval_geneoverlaps, nbp_mode=FALSE)
  print('done')

  X <- 1
  if (X==1) {
  # by genic/nongenic
  for (x in c("n")) {
    for (y in c("cnv","del","dup")) {
      nongenic <- subset(cnv_list[[y]], (locus %in% names(locus_ngenes))==F)
      genic <- subset(cnv_list[[y]], locus %in% names(locus_ngenes)) 
      xy <- paste0(x,"_",y)
      df <- CnvCount(df, 
                     nongenic, 
                     paste0(xy,"_nongenic"), 
                     count_type=x)
                     
      df <- CnvCount(df, 
                     genic, 
                     paste0(xy,"_genic"), 
                     count_type=x) 
    }
  }

  # by genic/nongenic (small cnvs)
  for (x in c("n")) {
    for (y in c("cnv","del","dup")) {
      nongenic <- subset(cnv_g1_list[[y]], (locus %in% names(locus_ngenes))==F)
      genic <- subset(cnv_g1_list[[y]], locus %in% names(locus_ngenes)) 
      xy <- paste0(x,"_",y)
      df <- CnvCount(df, 
                     nongenic, 
                     paste0(xy,"_g1_nongenic"), 
                     count_type=x)
                     
      df <- CnvCount(df, 
                     genic, 
                     paste0(xy,"_g1_genic"), 
                     count_type=x) 
    }
  }


  # by pli bin
  pli_0_to_5.intervals <- subset(interval_max_pli, max_pli < 0.5)[,1]
  pli_5_to_995.intervals <- subset(interval_max_pli, 
                                   (max_pli >= 0.5) & (max_pli < 0.995))[,1]
  pli_995_to_1.intervals <- subset(interval_max_pli, 
                                   (max_pli >= 0.995) & (max_pli <= 1))[,1]
  pli_5_to_1.intervals <- c(pli_5_to_995.intervals,
                            pli_995_to_1.intervals)
  for (x in c("n")) {
    for (y in c("cnv","del","dup")) {
      cnv.pli_0_to_5 <- subset(cnv_list[[y]],
                               locus %in% pli_0_to_5.intervals)
      cnv.pli_5_to_1 <- subset(cnv_list[[y]],
                               locus %in% pli_5_to_1.intervals)
      cnv.pli_5_to_995 <- subset(cnv_list[[y]],
                                 locus %in% pli_5_to_995.intervals)
      cnv.pli_995_to_1 <- subset(cnv_list[[y]],
                                 locus %in% pli_995_to_1.intervals)
      xy <- paste0(x,"_",y)
      df <- CnvCount(df, 
                     cnv.pli_0_to_5, 
                     paste0(xy,"_pli_0_to_50"), 
                     count_type=x)
      df <- CnvCount(df, 
                     cnv.pli_5_to_1, 
                     paste0(xy,"_pli_50_to_1"), 
                     count_type=x)
      df <- CnvCount(df, 
                     cnv.pli_5_to_995, 
                     paste0(xy,"_pli_50_to_995"), 
                     count_type=x)
      df <- CnvCount(df, 
                     cnv.pli_995_to_1, 
                     paste0(xy,"_pli_995_to_1"), 
                     count_type=x)


    }
  }

  # init list for keeping track of deleterious cnvs:
  # 1. impacting a gene with pLI > 0.995
  # 2. known neurodevelopmental CNV (Kendall et al. 2019)
  # 3. known neurodevelopmental gene (Fu et al. 2022)
  dmg_cnv_list <- list()

  # write CNVs overlapping a gene with pLI > 0.995 to file and to list
  cnv.pli_995_to_1 <- subset(cnv_list[["cnv"]],
                             locus %in% pli_995_to_1.intervals)
  pli_gt_995_genes <- subset(gnomad_pli, pLI > 0.995)$gene
  x <- interval_geneoverlaps[interval_geneoverlaps[,2] %in% pli_gt_995_genes,,drop=F]
  colnames(x) <- c("locus","gene_overlaps")
  cnv.pli_995_to_1.bed <- merge(cnv.pli_995_to_1, x, by='locus')
  cnv.pli_995_to_1.bed <- cnv.pli_995_to_1.bed[,c(2,3,4,1,5,6,9)]
  cnv.pli_995_to_1.bed <- unique(cnv.pli_995_to_1.bed)
  cnv.pli_995_to_1.bed <- subset(cnv.pli_995_to_1.bed, IID %in% case_iids)
  colnames(cnv.pli_995_to_1.bed) <- c("chrom","start0","end","locus","cnvtype",
                                      "IID","overlap")
  write.table(cnv.pli_995_to_1.bed,
              file=paste0(opt[["outroot"]], ".pLI_gt_995_overlaps.cases_only.bed"),
              row.names=F, col.names=F, quote=F, sep="\t")
  dmg_cnv_list[["pLI_995"]] <- cnv.pli_995_to_1.bed

  # by pli bin (small cnvs)
  for (x in c("n")) {
    for (y in c("cnv","del","dup")) {
      cnv.pli_0_to_5 <- subset(cnv_g1_list[[y]],
                               locus %in% pli_0_to_5.intervals)
      cnv.pli_5_to_1 <- subset(cnv_g1_list[[y]],
                               locus %in% pli_5_to_1.intervals)
      cnv.pli_5_to_995 <- subset(cnv_g1_list[[y]],
                               locus %in% pli_5_to_995.intervals)
      cnv.pli_995_to_1 <- subset(cnv_g1_list[[y]],
                               locus %in% pli_995_to_1.intervals)
      xy <- paste0(x,"_",y)
      print(xy)
      df <- CnvCount(df, 
                     cnv.pli_0_to_5, 
                     paste0(xy,"_g1_pli_0_to_50"), 
                     count_type=x)
      df <- CnvCount(df, 
                     cnv.pli_5_to_1, 
                     paste0(xy,"_g1_pli_50_to_1"), 
                     count_type=x)
      df <- CnvCount(df, 
                     cnv.pli_5_to_995, 
                     paste0(xy,"_g1_pli_50_to_995"), 
                     count_type=x)
      df <- CnvCount(df, 
                     cnv.pli_995_to_1, 
                     paste0(xy,"_g1_pli_995_to_1"), 
                     count_type=x)

    }
  }
 
  }

  # get total number of cnv-impacted bases per sample
  # for (x in c("del","dup")) {
  #   col <- paste0("nbp_",x)
  #   cnv <- cnv_list[[x]]
  #   df[[col]] <- rep(0, nrow(df))
  #   cnv$length <- cnv$end - cnv$start0
  #   iids <- unique(cnv$IID)
  #   for (iid in iids) {
  #     nbp.i <- sum(subset(cnv, IID == iid)$length)
  #   }
  # }

  # deletions (by constraint bin) 
  del <- cnv_list[["del"]]
  # del <- subset(del, locus %in% names(locus_ngenes)) 
  del$length <- del$end - del$start0
  del.cscores <- subset(cons, locus %in% del$locus)
  df$nbp_del_notcons <- rep(0, nrow(df))
  df$nbp_del_cons <- rep(0, nrow(df))
  df$nbp_del_cons0to1 <- rep(0, nrow(df))
  df$nbp_del_cons1to2 <- rep(0, nrow(df))
  df$nbp_del_cons2to3 <- rep(0, nrow(df))
  df$nbp_del_cons3to4 <- rep(0, nrow(df))
  df$nbp_del_cons4to5 <- rep(0, nrow(df))
  df$nbp_del_cons5to6 <- rep(0, nrow(df))
  df$nbp_del_cons6to7 <- rep(0, nrow(df))
  df$nbp_del_cons7to8 <- rep(0, nrow(df))
  df$nbp_del_cons8to9 <- rep(0, nrow(df))
  carrier_iids <- unique(del$IID)
  for (iid in carrier_iids) {
    del.i <- subset(del, IID == iid)
    del.cscores.i <- subset(del.cscores,
                            locus %in% del.i$locus)
    if (nrow(del.cscores.i) == 0) {next}
    # df[iid, "nbp_del_notcons"] <- sum(del.cscores.i$X.20to2.27)
    df[iid, "nbp_del_notcons"] <- sum(del.cscores.i[,uncons_cols])   
    df[iid, "nbp_del_cons"] <- sum(del.cscores.i$X2.27to12)
    for (i in 0:8) {
      j <- i + 1
      cscores.col <- paste0("X",i,"to",j)
      col <- paste0("nbp_del_cons",i,"to",j)
      df[iid, col] <- sum(del.cscores.i[[cscores.col]])

    }
  }
  df$nbp_del_cons0to3 <- df$nbp_del_cons0to1 + df$nbp_del_cons1to2 + df$nbp_del_cons2to3
  df$nbp_del_cons3to6 <- df$nbp_del_cons3to4 + df$nbp_del_cons4to5 + df$nbp_del_cons5to6
  df$nbp_del_cons6to9 <- df$nbp_del_cons6to7 + df$nbp_del_cons7to8 + df$nbp_del_cons8to9

  # duplications (by constraint bin)
  dup <- cnv_list[["dup"]]
  # dup <- subset(dup, locus %in% names(locus_ngenes))
  dup$length <- dup$end - dup$start0
  dup.cscores <- subset(cons, locus %in% dup$locus)
  df$nbp_dup_notcons <- rep(0, nrow(df))
  df$nbp_dup_cons <- rep(0, nrow(df))
  df$nbp_dup_cons0to1 <- rep(0, nrow(df))
  df$nbp_dup_cons1to2 <- rep(0, nrow(df))
  df$nbp_dup_cons2to3 <- rep(0, nrow(df))
  df$nbp_dup_cons3to4 <- rep(0, nrow(df))
  df$nbp_dup_cons4to5 <- rep(0, nrow(df))
  df$nbp_dup_cons5to6 <- rep(0, nrow(df))
  df$nbp_dup_cons6to7 <- rep(0, nrow(df))
  df$nbp_dup_cons7to8 <- rep(0, nrow(df))
  df$nbp_dup_cons8to9 <- rep(0, nrow(df))
  carrier_iids <- unique(dup$IID)
  for (iid in carrier_iids) {
    dup.i <- subset(dup, IID == iid)
    dup.cscores.i <- subset(dup.cscores,
                            locus %in% dup.i$locus)
    if (nrow(dup.cscores.i) == 0) {next}
    # df[iid, "nbp_dup_notcons"] <- sum(dup.cscores.i$X.20to2.27)
    df[iid, "nbp_dup_notcons"] <- sum(dup.cscores.i[,uncons_cols])
    df[iid, "nbp_dup_cons"] <- sum(dup.cscores.i$X2.27to12)
    for (i in 0:8) {
      j <- i + 1
      cscores.col <- paste0("X",i,"to",j)
      col <- paste0("nbp_dup_cons",i,"to",j)
      df[iid, col] <- sum(dup.cscores.i[[cscores.col]])

    }
  }
  df$n_dup_cons <- rep(0, nrow(df))
  for (iid in carrier_iids) {
    dup.i <- subset(dup, IID == iid)
    dup.cscores.i <- subset(dup.cscores,
                                     locus %in% dup.i$locus)
    n_dup <- nrow(dup.cscores.i)
    df[iid, "n_dup_cons"] <- n_dup
  }
  df$nbp_dup_cons0to3 <- df$nbp_dup_cons0to1 + df$nbp_dup_cons1to2 + df$nbp_dup_cons2to3
  df$nbp_dup_cons3to6 <- df$nbp_dup_cons3to4 + df$nbp_dup_cons4to5 + df$nbp_dup_cons5to6
  df$nbp_dup_cons6to9 <- df$nbp_dup_cons6to7 + df$nbp_dup_cons7to8 + df$nbp_dup_cons8to9

  # deletions (by constraint bin) - noncoding cnvs only
  del <- cnv_list[["del"]]
  del <- subset(del, (locus %in% names(locus_ngenes))==F)
  del$length <- del$end - del$start0
  del.cscores <- subset(cons, locus %in% del$locus)
  df$nbp_del_notcons_nc <- rep(0, nrow(df))
  df$nbp_del_cons_nc <- rep(0, nrow(df))
  df$nbp_del_cons0to1_nc <- rep(0, nrow(df))
  df$nbp_del_cons1to2_nc <- rep(0, nrow(df))
  df$nbp_del_cons2to3_nc <- rep(0, nrow(df))
  df$nbp_del_cons3to4_nc <- rep(0, nrow(df))
  df$nbp_del_cons4to5_nc <- rep(0, nrow(df))
  df$nbp_del_cons5to6_nc <- rep(0, nrow(df))
  df$nbp_del_cons6to7_nc <- rep(0, nrow(df))
  df$nbp_del_cons7to8_nc <- rep(0, nrow(df))
  df$nbp_del_cons8to9_nc <- rep(0, nrow(df))
  carrier_iids <- unique(del$IID)
  for (iid in carrier_iids) {
    del.i <- subset(del, IID == iid)
    del.cscores.i <- subset(del.cscores,
                            locus %in% del.i$locus)
    if (nrow(del.cscores.i) == 0) {next}
    # df[iid, "nbp_del_notcons_nc"] <- sum(del.cscores.i$X.20to2.27)
    df[iid, "nbp_del_notcons_nc"] <- sum(del.cscores.i[,uncons_cols])
    df[iid, "nbp_del_cons_nc"] <- sum(del.cscores.i$X2.27to12)
    for (i in 0:8) {
      j <- i + 1
      cscores.col <- paste0("X",i,"to",j)
      col <- paste0("nbp_del_cons",i,"to",j,"_nc")
      df[iid, col] <- sum(del.cscores.i[[cscores.col]])

    }
  }
  df$nbp_del_cons0to3_nc <- df$nbp_del_cons0to1_nc + df$nbp_del_cons1to2_nc + df$nbp_del_cons2to3_nc
  df$nbp_del_cons3to6_nc <- df$nbp_del_cons3to4_nc + df$nbp_del_cons4to5_nc + df$nbp_del_cons5to6_nc
  df$nbp_del_cons6to9_nc <- df$nbp_del_cons6to7_nc + df$nbp_del_cons7to8_nc + df$nbp_del_cons8to9_nc
  df$nbp_del_nc <- df$nbp_del_notcons_nc + df$nbp_del_cons_nc

  # duplications (by constraint bin) - noncoding cnvs
  dup <- cnv_list[["dup"]]
  dup <- subset(dup, (locus %in% names(locus_ngenes))==F) 
  dup$length <- dup$end - dup$start0
  dup.cscores <- subset(cons, locus %in% dup$locus)
  df$nbp_dup_notcons_nc <- rep(0, nrow(df))
  df$nbp_dup_cons_nc <- rep(0, nrow(df))
  df$nbp_dup_cons0to1_nc <- rep(0, nrow(df))
  df$nbp_dup_cons1to2_nc <- rep(0, nrow(df))
  df$nbp_dup_cons2to3_nc <- rep(0, nrow(df))
  df$nbp_dup_cons3to4_nc <- rep(0, nrow(df))
  df$nbp_dup_cons4to5_nc <- rep(0, nrow(df))
  df$nbp_dup_cons5to6_nc <- rep(0, nrow(df))
  df$nbp_dup_cons6to7_nc <- rep(0, nrow(df))
  df$nbp_dup_cons7to8_nc <- rep(0, nrow(df))
  df$nbp_dup_cons8to9_nc <- rep(0, nrow(df))
  carrier_iids <- unique(dup$IID)
  for (iid in carrier_iids) {
    dup.i <- subset(dup, IID == iid)
    dup.cscores.i <- subset(dup.cscores,
                            locus %in% dup.i$locus)
    if (nrow(dup.cscores.i) == 0) {next}
    # df[iid, "nbp_dup_notcons_nc"] <- sum(dup.cscores.i$X.20to2.27)
    df[iid, "nbp_dup_notcons_nc"] <- sum(dup.cscores.i[,uncons_cols])
    df[iid, "nbp_dup_cons_nc"] <- sum(dup.cscores.i$X2.27to12)
    for (i in 0:8) {
      j <- i + 1
      cscores.col <- paste0("X",i,"to",j)
      col <- paste0("nbp_dup_cons",i,"to",j,"_nc")
      df[iid, col] <- sum(dup.cscores.i[[cscores.col]])

    }
  }
  df$n_dup_cons_nc <- rep(0, nrow(df))
  for (iid in carrier_iids) {
    dup.i <- subset(dup, IID == iid)
    dup.cscores.i <- subset(dup.cscores,
                                     locus %in% dup.i$locus)
    n_dup <- nrow(dup.cscores.i)
    df[iid, "n_dup_cons_nc"] <- n_dup
  }
  df$nbp_dup_cons0to3_nc <- df$nbp_dup_cons0to1_nc + df$nbp_dup_cons1to2_nc + df$nbp_dup_cons2to3_nc
  df$nbp_dup_cons3to6_nc <- df$nbp_dup_cons3to4_nc + df$nbp_dup_cons4to5_nc + df$nbp_dup_cons5to6_nc
  df$nbp_dup_cons6to9_nc <- df$nbp_dup_cons6to7_nc + df$nbp_dup_cons7to8_nc + df$nbp_dup_cons8to9_nc
  df$nbp_dup_nc <- df$nbp_dup_notcons_nc + df$nbp_dup_cons_nc

  # neurodev cnvs (for dels, add in exonic dels that qualify)
  ndev_del <- read.table(NDEV_DEL_BED, header=F, sep="\t", stringsAsFactors=F)
  ndev_dup <- read.table(NDEV_DUP_BED, header=F, sep="\t", stringsAsFactors=F)
  
  # exclude the following neurodev CNV classes due to having a 
  # case/control CNV count of at least 10 :
  # 1. dels : 15q11.2_del_BP1-BP2
  # 2. dups : 15q11.2_dup_BP1-BP2, 16p13.11_dup
  ndev_del <- ndev_del[(ndev_del[,10] %in% c("15q11.2_del_BP1-BP2"))==F, ]
  ndev_dup <- ndev_dup[(ndev_dup[,10] %in% c("15q11.2_dup_BP1-BP2",
                                             "16p13.11_dup"))==F, ] 

  # write combined and filtered neurodev CNV table to file, and store to list
  ndev_cnv_bed <- rbind(ndev_del, ndev_dup)
  ndev_cnv_bed <- ndev_cnv_bed[,c(1:6,10)]
  colnames(ndev_cnv_bed) <- c("chrom","start0","end","locus","cnvtype","IID",
                              "overlap")
  ndev_cnv_bed <- subset(ndev_cnv_bed, IID %in% case_iids)
  write.table(ndev_cnv_bed,
              file=paste0(opt[["outroot"]], ".neurodev_CNV_overlaps.cases_only.bed"),
              row.names=F, col.names=F, quote=F, sep="\t")
  dmg_cnv_list[["neurodev"]] <- ndev_cnv_bed

  # get ndev cnv counts per sample
  ndev_del <- ndev_del[,1:6]
  ndev_dup <- ndev_dup[,1:6]
  ndev_exonicdel_genes <- scan(NDEV_EXONICDEL_GENELIST, what=character())
  interval_geneoverlaps.x <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                   %in% ndev_exonicdel_genes, ,
                                                   drop=F] 
  interval_geneoverlaps.x <- interval_geneoverlaps.x[interval_geneoverlaps.x[,1]
                                                     %in% cnv_list[["del"]], ,
                                                     drop=F]
  ndev_del.x <- subset(cnv_list[["del"]], locus %in% interval_geneoverlaps.x[,1])
  ndev_del.x <- subset(ndev_del.x, (locus %in% ndev_del[,4])==F)
  ndev_del <- rbind(ndev_del, ndev_del.x)
  colnames(ndev_del) <- c("chrom","start0","end","locus","cnvtype","IID")
  colnames(ndev_dup) <- c("chrom","start0","end","locus","cnvtype","IID")
  ndev_cnv <- rbind(ndev_del, ndev_dup)
  df <- CnvCount(df,
                 ndev_del,
                 "n_del_neurodev",
                 count_type="n")
  df <- CnvCount(df,
                 ndev_dup,
                 "n_dup_neurodev",
                 count_type="n")
  df <- CnvCount(df,
                 ndev_cnv,
                 "n_cnv_neurodev",
                 count_type="n")

  # write neurodev overlaps BED to file
  write.table(subset(ndev_del.x, IID %in% case_iids),
              file=paste0(opt[["outroot"]],
                          ".neurodev_del_exonoverlaps.cases_only.bed"),
              row.names=F, col.names=F, sep="\t", quote=F)

  # which cnv loci overlap NDD genes from the Fu et al. 2022 manuscript?
  ndd_genes <- scan(NDD_GENES, what=character())

  # exclude PTGDR2 (CNV count > 10),
  # CORO1A (CNV count > 10)
  ndd_genes <- ndd_genes[(ndd_genes %in% c("PTGDR2","CORO1A"))==F]

  interval_geneoverlaps.x <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                   %in% ndd_genes, ,
                                                   drop=F] 
  interval_geneoverlaps.x <- interval_geneoverlaps.x[interval_geneoverlaps.x[,1]
                                                     %in% cnv_list[["cnv"]][,4], ,
                                                     drop=F]
  ndd_del <- subset(cnv_list[["del"]], locus %in% interval_geneoverlaps.x[,1])
  ndd_dup <- subset(cnv_list[["dup"]], locus %in% interval_geneoverlaps.x[,1])
  ndd_cnv <- subset(cnv_list[["cnv"]], locus %in% interval_geneoverlaps.x[,1])

  # write NDD BED to file and to list
  colnames(interval_geneoverlaps.x) <- c("locus","overlap")
  ndd_cnv_bed <- merge(ndd_cnv, interval_geneoverlaps.x, by='locus')
  ndd_cnv_bed <- subset(ndd_cnv_bed, IID %in% case_iids)
  ndd_cnv_bed <- ndd_cnv_bed[,c(2,3,4,1,5,6,9)]
  ndd_cnv_bed <- unique(ndd_cnv_bed)
  colnames(ndd_cnv_bed) <- c("chrom","start0","end","locus","cnvtype",
                             "IID","overlap")
  write.table(ndd_cnv_bed,
              file=paste0(opt[["outroot"]], ".NDD_gene_overlaps.cases_only.bed"),
              row.names=F, col.names=F, sep="\t", quote=F)
  dmg_cnv_list[["NDD"]] <- ndd_cnv_bed

  # get NDD gene CNV counts per sample
  df <- CnvCount(df,
                 ndd_del,
                 "n_del_NDD",
                 count_type="n")
  df <- CnvCount(df,
                 ndd_dup,
                 "n_dup_NDD",
                 count_type="n")
  df <- CnvCount(df,
                 ndd_cnv,
                 "n_cnv_NDD",
                 count_type="n")

  # get non-NDD counts
  ndd0_del <- subset(cnv_list[["del"]], 
                     (locus %in% interval_geneoverlaps.x[,1])==F)
  ndd0_dup <- subset(cnv_list[["dup"]], 
                     (locus %in% interval_geneoverlaps.x[,1])==F)
  ndd0_cnv <- subset(cnv_list[["cnv"]], 
                     (locus %in% interval_geneoverlaps.x[,1])==F)
  df <- CnvCount(df,
                 ndd0_del,
                 "n_del_NDD0",
                 count_type="n")
  df <- CnvCount(df,
                 ndd0_dup,
                 "n_dup_NDD0",
                 count_type="n")
  df <- CnvCount(df,
                 ndd0_cnv,
                 "n_cnv_NDD0",
                 count_type="n")

  # get cnvs overlapping a haploinsuff or triploinsuff gene
  interval_geneoverlaps.h <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                   %in% haploinsuff_genes, ,
                                                   drop=F] 
  interval_geneoverlaps.t <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                   %in% triploinsuff_genes, ,
                                                   drop=F] 
  interval_geneoverlaps.h0 <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                    %in% haplosuff_genes, ,
                                                    drop=F] 
  interval_geneoverlaps.t0 <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                    %in% triplosuff_genes, ,
                                                    drop=F] 
  haplo1_del <- subset(cnv_list[["del"]], 
                       (locus %in% interval_geneoverlaps.h[,1])==T)
  haplo1_dup <- subset(cnv_list[["dup"]], 
                       (locus %in% interval_geneoverlaps.h[,1])==T)
  triplo1_del <- subset(cnv_list[["del"]], 
                        (locus %in% interval_geneoverlaps.t[,1])==T)
  triplo1_dup <- subset(cnv_list[["dup"]], 
                        (locus %in% interval_geneoverlaps.t[,1])==T)
  haplo0_del <- subset(cnv_list[["del"]], 
                       (locus %in% interval_geneoverlaps.h[,1])==F)
  haplo0_dup <- subset(cnv_list[["dup"]], 
                       (locus %in% interval_geneoverlaps.h[,1])==F)
  triplo0_del <- subset(cnv_list[["del"]], 
                        (locus %in% interval_geneoverlaps.t[,1])==F)
  triplo0_dup <- subset(cnv_list[["dup"]], 
                        (locus %in% interval_geneoverlaps.t[,1])==F)
  df <- CnvCount(df,
                 haplo1_del,
                 "n_del_phaplo_ge_86",
                 count_type="n")
  df <- CnvCount(df,
                 haplo1_dup,
                 "n_dup_phaplo_ge_86",
                 count_type="n")
  df <- CnvCount(df,
                 triplo1_del,
                 "n_del_ptriplo_ge_94",
                 count_type="n")
  df <- CnvCount(df,
                 triplo1_dup,
                 "n_dup_ptriplo_ge_94",
                 count_type="n")
  df <- CnvCount(df,
                 haplo0_del,
                 "n_del_phaplo_lt_86",
                 count_type="n")
  df <- CnvCount(df,
                 haplo0_dup,
                 "n_dup_phaplo_lt_86",
                 count_type="n")
  df <- CnvCount(df,
                 triplo0_del,
                 "n_del_ptriplo_lt_94",
                 count_type="n")
  df <- CnvCount(df,
                 triplo0_dup,
                 "n_dup_ptriplo_lt_94",
                 count_type="n")

  # get CNVs overlapping a gene that is both haplo- and triplo-insufficient
  interval_geneoverlaps.ht <- interval_geneoverlaps[interval_geneoverlaps[,2]
                                                    %in% haploinsuff_triploinsuff_genes, ,
                                                    drop=F] 
  haplotriplo1_del <- subset(cnv_list[["del"]], 
                             (locus %in% interval_geneoverlaps.ht[,1])==T)
  haplotriplo0_del <- subset(cnv_list[["del"]], 
                             (locus %in% interval_geneoverlaps.ht[,1])==F)
  haplotriplo1_dup <- subset(cnv_list[["dup"]], 
                             (locus %in% interval_geneoverlaps.ht[,1])==T)
  haplotriplo0_dup <- subset(cnv_list[["dup"]], 
                             (locus %in% interval_geneoverlaps.ht[,1])==F)
  df <- CnvCount(df,
                 haplotriplo1_del,
                 "n_del_phaplotriplo_1",
                 count_type="n")
  df <- CnvCount(df,
                 haplotriplo0_del,
                 "n_del_phaplotriplo_0",
                 count_type="n")
  df <- CnvCount(df,
                 haplotriplo1_dup,
                 "n_dup_phaplotriplo_1",
                 count_type="n")
  df <- CnvCount(df,
                 haplotriplo0_dup,
                 "n_dup_phaplotriplo_0",
                 count_type="n")



  # init results df
  res_df <- NULL
  cmh_df <- NULL
  covars_list <- list(
                      "ALL"=assoc_covar
                     )

  # before running tests, write full metrics to file
  write.table(df,
              file=paste0(opt[["outroot"]],".metrics.tsv"),
              row.names=F,
              col.names=T,
              sep="\t",
              quote=F)

  # for each analysis group ..
  groups <- c("ALL")
  for (grp in groups) {

    Y <- 1
    if (Y == 1) {

    # run global cnv tests
    RUNTHIS <- 1
    if (RUNTHIS == 1) {
    for (x in c("n","n100kb")) {
      for (y in c("cnv","del","dup")) {
        xy <- paste0(x,"_",y)
        res_df <- AssocTest(df, xy, 
                            group=grp,
                            covariates=covars_list[[grp]],
                            outcome="CASE",
                            out_df=res_df)
        # TEST
        cmh_df <- CMHTest(df, 
                          xy,
                          group_col='GROUP',
                          groupname_recode=NA,
                          outcome="CASE",
                          out_df=cmh_df)

      }
    }

    # dataset col to use for checks
    ds_col <- "dataset_adj"
     
    # global cnv tests (test for unequal means across ctrl-only, case-only data)
    ds_cmp_stats <- data.frame(pheno=character(),
                               metric=character(),
                               metric_aov_p=numeric())
    dataset_stats <- data.frame(dataset=character(),
                                dataset_n=numeric(),
                                metric=character(),
                                metric_mean=numeric(),
                                metric_95ci_l=numeric(),
                                metric_95ci_u=numeric())
    df.ca <- subset(df, CASE==1)
    df.co <- subset(df, CASE==0)
    for (x in c("n")) {
      for (y in c("del","dup","cnv")) {
        for (z in c("",
                    "_nongenic")) {
          xyz<-paste0(x,"_",y,z)

          # get means per ds
          datasets <- unique(sort(df[[ds_col]]))
          for (ds in datasets) {
            df.ds <- subset(df, df[[ds_col]] == ds)
            vals <- df.ds[[xyz]]
            mean.x <- as.numeric(t.test(vals)$estimate)
            ci.l.x <- t.test(vals)$conf.int[1]
            ci.u.x <- t.test(vals)$conf.int[2] 
            dataset_stats <- rbind(dataset_stats,
                                   data.frame(dataset=ds,
                                              dataset_n=nrow(df.ds),
                                              metric=xyz,
                                              metric_mean=mean.x,
                                              metric_95ci_l=ci.l.x,
                                              metric_95ci_u=ci.u.x)
                                  )
          }                                      

             
          # form model for case dataset comparison, ctrl dataset comparison
          mdl_str <- paste0(xyz,"~",ds_col)
          mdl <- as.formula(mdl_str)

          # do case dataset comparison and control dataset comparison
          res.ca <- summary(aov(mdl,data=df.ca))
          res.co <- summary(aov(mdl,data=df.co))
          
          # extract pvals
          p.ca <- res.ca[[1]][1,5]
          p.co <- res.co[[1]][1,5]

          # add to df
          ds_cmp_stats <- rbind(ds_cmp_stats,
                                data.frame(pheno="case",
                                           metric=xyz,
                                           metric_aov_p=p.ca)
                               )
          ds_cmp_stats <- rbind(ds_cmp_stats,
                                data.frame(pheno="ctrl",
                                           metric=xyz,
                                           metric_aov_p=p.co)
                               )


        } 
      }
    }
    print(dataset_stats)
    print(ds_cmp_stats)
    
    # write stats to file
    write.table(dataset_stats, 
                file=paste0(opt[["outroot"]],".stats_per_dataset.tsv"),
                row.names=F, col.names=T,
                sep="\t", quote=F)
    write.table(ds_cmp_stats, 
                file=paste0(opt[["outroot"]],".dataset_comparison_stats.tsv"),
                row.names=F, col.names=T,
                sep="\t", quote=F)
 
    COCO_CACA <- TRUE
    if (COCO_CACA==TRUE) {
    # global cnv tests
    coco_df <- NULL
    caca_df <- NULL
    for (i in c("n","nbp")) {
      for (j in c("del","dup")) {
        x<-paste0(i,"_",j)
        print(x)
        if (opt[["no-angi"]] == FALSE) {
        
          # ANGI vs LG500_ctrls
          df.x <- subset(df, dataset_adj %in% c("ANGI","LG500_ctrls"))
          df.x$CASE <- ifelse(df.x$dataset_adj=="ANGI",1,0)
          coco_df <- AssocTest(df.x, x, 
                               group="ALL",
                               covariates=c("LRR_SD","FEMALE"),
                               groupname_recode="ANGI_vs_LG500ctrls",
                               outcome="CASE",
                               out_df=coco_df)

          # ANGI vs norway_ctrls
          df.x <- subset(df, dataset_adj %in% c("ANGI","norway_ctrls"))
          df.x$CASE <- ifelse(df.x$dataset_adj=="ANGI",1,0)
          coco_df <- AssocTest(df.x, x, 
                               group="ALL",
                               covariates=c("LRR_SD","FEMALE"),
                               groupname_recode="ANGI_vs_norwayctrls",
                               outcome="CASE",
                               out_df=coco_df)
        
        }

        # norway_ctrls vs LG500
        df.x <- subset(df, dataset_adj %in% c("LG500_ctrls","norway_ctrls"))
        df.x$CASE <- ifelse(df.x$dataset_adj=="LG500_ctrls",1,0)
        coco_df <- AssocTest(df.x, x, 
                             group="ALL",
                             covariates=c("LRR_SD","FEMALE"),
                             groupname_recode="LG500ctrls_vs_norwayctrls",
                             outcome="CASE",
                             out_df=coco_df)
        # NORDiC-NOR vs NORDiC-SWE
        df.x <- subset(df, dataset_adj %in% c("NORDiC-NOR","NORDiC-SWE"))
        df.x$CASE <- ifelse(df.x$dataset_adj=="NORDiC-NOR",1,0)
        caca_df <- AssocTest(df.x, x, 
                             group="ALL",
                             covariates=c("LRR_SD","FEMALE"),
                             groupname_recode="NORDiC-NOR_vs_NORDiC-SWE",
                             outcome="CASE",
                             out_df=caca_df)
   
      }
    }

    # write burden comparisons to file
    write.table(coco_df,
                file=paste0(opt[["outroot"]],".coco.tsv"),
                col.names=T, row.names=F,
                quote=F, sep="\t")
    write.table(caca_df,
                file=paste0(opt[["outroot"]],".caca.tsv"),
                col.names=T, row.names=F,
                quote=F, sep="\t")

    }

    # global cnv tests (leave-one-out analysis wrt datasets)
    datasets <- unique(sort(df[[ds_col]]))
    tests <- c("cnv","del","dup",
              "del_nongenic","dup_nongenic",
              "cnv_gt_1mb","del_pli_50_to_1","dup_pli_50_to_1")
    for (ds in datasets) {
      df.d0 <- subset(df, df[[ds_col]] != ds)
      for (x in c("n")) {
        for (y in tests) {
          xy <- paste0(x,"_",y)
          print(xy)
          res_df <- AssocTest(df.d0, xy, 
                              group="ALL",
                              covariates=c("LRR_SD","FEMALE"),
                              groupname_recode=paste0("leaveoneout_",ds),
                              outcome="CASE",
                              out_df=res_df)
   
        }
      }
    }

    # global cnv tests (leave-one-out analysis wrt covariates)
    covariates_full <- covars_list[[grp]]
    tests <- c("cnv","del","dup")
    for (covar in covariates_full) {
      covariates_loo <- covariates_full[covariates_full != covar]
      for (x in c("n")) {
        for (y in tests) {
          xy <- paste0(x,"_",y)
          res_df <- AssocTest(df, xy, 
                              group="ALL",
                              covariates=covariates_loo,
                              groupname_recode=paste0("covarloo_",covar),
                              outcome="CASE",
                              out_df=res_df)
   
        }
      }
    }
   
    # run assoc tests for size bins
    size_bins = c("30kb_to_100kb",
                  "100kb_to_500kb",
                  "500kb_to_1mb",
                  "gt_1mb")
    if (opt[["size-ge-100kb"]] == TRUE) {
      size_bins <- c("100kb_to_500kb",
                     "500kb_to_1mb",
                     "gt_1mb")
    } 
    for (x in c("n")) {
      for (y in c("del","dup","cnv")) {
        for (z in size_bins) {
          xyz <-paste0(x,"_",y,"_",z)
          print(xyz)
          res_df <- AssocTest(df, xyz, 
                              group=grp,
                              covariates=covars_list[[grp]],
                              outcome="CASE",
                              out_df=res_df)
          cmh_df <- CMHTest(df, 
                            xyz,
                            group_col='GROUP',
                            groupname_recode=NA,
                            outcome="CASE",
                            out_df=cmh_df)


        }
      }
    }      

    # cnv frequency bins
    n_bounds <- length(freq_lowerbounds)
    for (x in c("n")) {
      for (y in c("cnv","del","dup")) {
        for (i in 1:n_bounds) {
          z <- paste0("freq",freq_lowerbounds[i],"to",freq_upperbounds[i])
          xyz <-paste0(x,"_",y,"_",z)
          print(xyz)
          res_df <- AssocTest(df, xyz, 
                              group=grp,
                              covariates=covars_list[[grp]],
                              outcome="CASE",
                              out_df=res_df)
          cmh_df <- CMHTest(df, 
                            xyz,
                            group_col='GROUP',
                            groupname_recode=NA,
                            outcome="CASE",
                            out_df=cmh_df)

      
      }
     }
    }
   
    # run genic/nongenic
    for (x in c("n")) {
      for (y in c("del","dup","cnv")) {
        for (z in c("nongenic","genic")) {
          xyz <- paste0(x,"_",y,"_",z)
          print(xyz)
          res_df <- AssocTest(df, xyz, 
                              group=grp,
                              covariates=covars_list[[grp]],
                              outcome="CASE",
                              out_df=res_df)
          cmh_df <- CMHTest(df, 
                            xyz,
                            group_col='GROUP',
                            groupname_recode=NA,
                            outcome="CASE",
                            out_df=cmh_df)


        }
      }
    }

    # run tests for assoc between ngenes overlap and case status
    for (x in c("ngenes")) {
      for (y in c("del","dup","cnv")) {
        xy <- paste0(x,"_",y)
        res_df <- AssocTest(df, 
                            xy, 
                            group=grp,
                            covariates=covars_list[[grp]],
                            outcome="CASE",
                            out_df=res_df)
        cmh_df <- CMHTest(df, 
                          xy,
                          group_col='GROUP',
                          groupname_recode=NA,
                          outcome="CASE",
                          out_df=cmh_df)

      }
    }

    # run assoc tests for pli bins
    for (x in c("n")) {
      for (y in c("del","dup","cnv")) {
        for (z in c("pli_0_to_50","pli_50_to_1","pli_50_to_995","pli_995_to_1")) {
          xyz <-paste0(x,"_",y,"_",z)
          print(xyz)
          res_df <- AssocTest(df, 
                              xyz, 
                              group=grp,
                              covariates=covars_list[[grp]],
                              outcome="CASE",
                              out_df=res_df)
          cmh_df <- CMHTest(df, 
                            xyz,
                            group_col='GROUP',
                            groupname_recode=NA,
                            outcome="CASE",
                            out_df=cmh_df)



        }
      }
    }   

    # run assoc tests for phaplo, ptriplo bins
    for (x in c("n")) {
      for (y in c("del","dup")) {
        for (z in c("phaplo_ge_86","phaplo_lt_86",
                    "ptriplo_ge_94","ptriplo_lt_94")) {
          xyz <-paste0(x,"_",y,"_",z)
          res_df <- AssocTest(df, 
                              xyz, 
                              group=grp,
                              covariates=covars_list[[grp]],
                              outcome="CASE",
                              out_df=res_df)
          cmh_df <- CMHTest(df, 
                            xyz,
                            group_col='GROUP',
                            groupname_recode=NA,
                            outcome="CASE",
                            out_df=cmh_df)



        }
      }
    }   

    # run assoc test for cnv in phaploptriplo intersect
    for (x in c("del","dup")) {
      for (z in c("1","0")) {
        xyz <- paste0("n_",x,"_phaplotriplo_", z)
        print(xyz)
        res_df <- AssocTest(df, 
                            xyz, 
                            group=grp,
                            covariates=covars_list[[grp]],
                            outcome="CASE",
                            out_df=res_df)
        cmh_df <- CMHTest(df, 
                          xyz,
                          group_col='GROUP',
                          groupname_recode=NA,
                          outcome="CASE",
                          out_df=cmh_df)
      }
    }

    # run assoc tests for genic/nongenic (small cnvs only)
    for (x in c("n")) {
      for (y in c("dup_g1","del_g1")) {
        for (z in c("nongenic","genic")) {
          xyz <-paste0(x,"_",y,"_",z)
          print(xyz)
          print(summary(df[[xyz]]))
          print(df[is.na(df[[xyz]]), ])
          if (max(df[[xyz]]) > 0) {
            res_df <- AssocTest(df, 
                                xyz, 
                                group=grp,
                                covariates=covars_list[[grp]],
                                outcome="CASE",
                                out_df=res_df)
            cmh_df <- CMHTest(df, 
                              xyz,
                              group_col='GROUP',
                              groupname_recode=NA,
                              outcome="CASE",
                              out_df=cmh_df)


          }
        }
      }
    }   

    # run assoc tests for pli bins (small cnvs only)
    for (x in c("n")) {
      for (y in c("dup_g1","del_g1")) {
        for (z in c("pli_0_to_50","pli_50_to_1","pli_50_to_995","pli_995_to_1")) {
          xyz <-paste0(x,"_",y,"_",z)
          print(xyz)
          if (max(df[[xyz]]) > 0) {
            res_df <- AssocTest(df, xyz, 
                                group=grp,
                                covariates=covars_list[[grp]],
                                outcome="CASE",
                                out_df=res_df)
            cmh_df <- CMHTest(df, 
                              xyz,
                              group_col='GROUP',
                              groupname_recode=NA,
                              outcome="CASE",
                              out_df=cmh_df)


          }
        }
      }
    }   
    }   




    }



  }

  # run assoc test for neurodev cnvs, NDD genes
  for (x in c("n_del", "n_dup", "n_cnv")) {
    for (y in c("neurodev","NDD","NDD0")) {
      xy <- paste0(x, "_", y)
      print(xy)
      res_df <- AssocTest(df, xy, 
                          group=grp,
                          covariates=covars_list[["ALL"]],
                          outcome="CASE",
                          out_df=res_df)
      cmh_df <- CMHTest(df, 
                        xy,
                        group_col='GROUP',
                        groupname_recode=NA,
                        outcome="CASE",
                        out_df=cmh_df)


    }
  }
  
  # final test : in Halvorsen et al. 2021, an excess of damaging coding de novo
  # mutations was seen in intolerant genes in male cases relative to female
  # cases. Do we see that in NORDiC as well?
  df.ca <- subset(df, CASE==1)
  for (x in c("NORDiC-","_casectrl_","female","male")) {
    df.ca$GROUP <- gsub(x,"",df.ca$GROUP)
  }
  df.ca$MALE <- ifelse(df.ca$FEMALE == 0, 1, 0)
  for (x in c("n")) {
    for (y in c("del","dup")) {
      for (z in c("pli_0_to_50","pli_50_to_1","pli_50_to_995","pli_995_to_1")) {
        xyz <-paste0(x,"_",y,"_",z)
        res_df <- AssocTest(df.ca, 
                            xyz, 
                            group=grp,
                            covariates=c("dataset","LRR_SD"),
                            groupname_recode="male_vs_female",
                            outcome="MALE",
                            out_df=res_df)
        cmh_df <- CMHTest(df.ca, 
                          xyz,
                          group_col='dataset',
                          groupname_recode="male_vs_female",
                          outcome="MALE",
                          out_df=cmh_df)

      }
    }
  }

  CONS <- TRUE
  if (CONS == TRUE) {

  # comparison of perc nongenic dels that hit constrained nts
  # comparison of n bases that are cnv burdened, partitioned by constraint bin
  for (x in c("nbp")) {
    for (y in c("del","dup")) {
      for (z in c(
                  "cons","cons0to3","cons3to6","cons6to9"
                  )) {

        # define count column
        xyz <- paste0(x,"_",y,"_",z)
        print(xyz)

        # convert to kilobases before running assoc test
        xyz1 <- gsub("nbp","nkbp",xyz)
        df[[xyz1]] <- df[[xyz]] / 1000
        df[[xyz]] <- NULL

        # run assoc
        res_df <- AssocTest(df, xyz1, 
                            group="ALL",
                            covariates=c(covars_list[["ALL"]],
                                         paste0("nbp_",y,"_notcons")),
                            outcome="CASE",
                            out_df=res_df)
      }
    }
  }

  # repeat, but focus on noncoding cnvs only
  for (x in c("nbp")) {
    for (y in c("del","dup")) {
      for (z in c(
                  "cons","cons0to3","cons3to6","cons6to9"
                 )) {

        # define count column
        xyz <- paste0(x,"_",y,"_",z,"_nc")
        print(xyz)

        # convert to kilobases before running assoc test
        xyz1 <- gsub("nbp","nkbp",xyz)
        df[[xyz1]] <- df[[xyz]] / 1000
        df[[xyz]] <- NULL

        # run assoc
        res_df <- AssocTest(df, xyz1, 
                            group="ALL",
                            covariates=c(covars_list[["ALL"]],
                                         paste0("nbp_",y,"_notcons_nc")),
                            outcome="CASE",
                            out_df=res_df)
      }
    }
  }



  }

  # write results to file
  write.table(res_df,
              file=paste0(opt[["outroot"]],".res.tsv"),
              row.names=F,
              col.names=T,
              sep="\t",
              quote=F)
  write.table(cmh_df,
              file=paste0(opt[["outroot"]],".cmh.tsv"),
              row.names=F,
              col.names=T,
              sep="\t",
              quote=F)

  # before finishing, make table with deleterious cnv calls, their
  # classification and their carriers and write to table. add numsnp underlying
  # each call in the process
  cnv_calls <- cnv_list[["cnv"]]
  cnv_calls <- merge(df[,c("IID","dataset","FEMALE")], cnv_calls, by="IID")
  cnv_call_data <-read.table(CNV_CALL_METRICS_TSV, header=T, stringsAsFactors=F)
  cnv_calls <- merge(cnv_calls, unique(cnv_call_data[,c("locus","numsnp")]), by="locus")
  cnv_calls$locus_IID <- paste0(cnv_calls$locus, "_", cnv_calls$IID)
  categories <- names(dmg_cnv_list)
  loci_iids <- c()
  for (x in categories) {
    cnv_calls.x <- dmg_cnv_list[[x]]
    loci_iids.x <- paste0(cnv_calls.x$locus, "_", cnv_calls.x$IID)
    dmg_cnv_list[[x]]$locus_IID <- loci_iids.x 
    loci_iids <- c(loci_iids, loci_iids.x)

  }
  loci_iids <- unique(sort(loci_iids))
  cnv_calls_dmg <- subset(cnv_calls, locus_IID %in% loci_iids)
  rownames(cnv_calls_dmg) <- cnv_calls_dmg$locus_IID
  for (x in categories) {
    cnv_calls_dmg[[paste0("overlaps_",x)]] <- rep(NA,nrow(cnv_calls_dmg))
  }
  for (i in rownames(cnv_calls_dmg)) {
    locus_i <- cnv_calls_dmg[i, "locus"]
    iid_i <- cnv_calls_dmg[i, "IID"]
    for (x in categories) {
      cnv_calls_dmg_x <- subset(dmg_cnv_list[[x]],
                                (IID == iid_i) &
                                (locus == locus_i)
                               )
      if (nrow(cnv_calls_dmg_x) > 0) {
        overlaps <- cnv_calls_dmg_x$overlap
        overlaps <- unique(sort(overlaps))
        overlaps_str <- paste(overlaps, collapse=",")
        cnv_calls_dmg[i, paste0("overlaps_",x)] <- overlaps_str
      }
    }
  }
  cnv_calls_dmg$locus_IID <- NULL
  write.table(cnv_calls_dmg,
              file=paste0(opt[["outroot"]], ".deleterious_cnv_annots.tsv"),
              row.names=F, col.names=T, sep="\t", quote=F)
}

NgenesPerSample <- function(df, cnv_list, interval_geneoverlaps,
                            nbp_mode=FALSE) {
  # Add per-sample gene-overlap burden columns to df, one per CNV class
  # ("del", "dup", "cnv").
  #
  # Args:
  #   df: per-sample data frame; rows are addressed by row name, so
  #       rownames(df) are expected to be the sample IIDs.
  #   cnv_list: list with elements "del", "dup" and "cnv"; each is a BED-like
  #       data frame with start in column 2, end in column 3, the locus id in
  #       column 4 and an IID column.
  #   interval_geneoverlaps: two-column table, locus id in column 1 and the
  #       overlapped gene in column 2.
  #   nbp_mode: if TRUE, sum the length (end - start) of gene-overlapping CNVs
  #       into "nbpgenes_<class>"; otherwise count the distinct genes
  #       overlapped into "ngenes_<class>".
  #
  # Returns:
  #   df with the three new columns; samples without qualifying CNVs get 0.
  if (nbp_mode == TRUE) {
    prefix <- "nbpgenes_"
  } else {
    prefix <- "ngenes_"
  }
  genic_loci <- interval_geneoverlaps[,1]

  for (x in c("del","dup","cnv")) {
    out_col <- paste0(prefix, x)
    cnv_bed <- cnv_list[[x]]
    df[[out_col]] <- rep(0, nrow(df))

    # Only visit carriers that are rows of df. Assigning to a row name that
    # does not exist would silently append an all-NA row to df. Coercing to
    # character also guarantees lookup by row name rather than by position.
    carrier_iids <- intersect(as.character(unique(cnv_bed$IID)), rownames(df))

    for (iid in carrier_iids) {
      # this sample's CNVs that overlap at least one gene
      cnv_bed.i <- subset(cnv_bed, IID==iid)
      cnv_bed.i <- cnv_bed.i[cnv_bed.i[,4] %in% genic_loci, , drop=FALSE]
      if (nrow(cnv_bed.i) == 0) {
        next
      }
      if (nbp_mode == TRUE) {
        df[iid, out_col] <- sum(cnv_bed.i[,3] - cnv_bed.i[,2])
      } else {
        # distinct genes hit across all of the sample's CNVs
        # (sort() drops NA gene names before counting)
        genes.i <- interval_geneoverlaps[genic_loci %in% cnv_bed.i[,4], 2]
        n.genes.cnv.i <- length(unique(sort(genes.i)))
        if (n.genes.cnv.i > 0) {
          df[iid, out_col] <- n.genes.cnv.i
        }
      }
    }
  }
  return(df)
}

CnvCount <- function(df, cnv_bed, out_col, count_type="n",
                     minfreq=NULL, maxfreq=NULL,
                     minlength=NULL, maxlength=NULL) {
  # Add a per-sample CNV burden column to df.
  #
  # Args:
  #   df: per-sample data frame with an IID column; rows are addressed by row
  #       name, so rownames(df) are expected to equal df$IID.
  #   cnv_bed: one row per CNV call with an IID column; a `freq` column is
  #       needed for the frequency filters and a `length` column for the
  #       length filters and the length-based count types.
  #   out_col: name of the column to create (or overwrite) in df.
  #   count_type: "n" (number of calls), "nbp" (summed length), "nkb"
  #       (summed length / 1000) or "n100kb" (summed length / 100000). Any
  #       other value leaves the column at 0.
  #   minfreq, maxfreq: optional inclusive bounds on `freq`.
  #   minlength, maxlength: optional inclusive bounds on `length`.
  #
  # Returns:
  #   df with out_col added; samples without qualifying calls get 0.

  # init column for counts in data frame
  df[[out_col]] <- rep(0, nrow(df))

  # if freq thresholds defined by user, apply to data
  if (!is.null(minfreq)) {
    cnv_bed <- subset(cnv_bed, freq >= minfreq)
  }
  if (!is.null(maxfreq)) {
    cnv_bed <- subset(cnv_bed, freq <= maxfreq)
  }

  # if length thresholds defined by user, apply to data
  if (!is.null(minlength)) {
    cnv_bed <- subset(cnv_bed, length >= minlength)
  }
  if (!is.null(maxlength)) {
    cnv_bed <- subset(cnv_bed, length <= maxlength)
  }

  # make sure counts are only for samples in count df
  cnv_bed <- subset(cnv_bed, IID %in% df$IID)

  # divisor turning a summed length into the requested unit
  unit <- switch(count_type,
                 "n"=1,
                 "nbp"=1,
                 "nkb"=1000,
                 "n100kb"=100000,
                 NULL)
  if (is.null(unit) || nrow(cnv_bed) == 0) {
    return(df)
  }

  # Group call indices by the IIDs that are actually present. Going through
  # as.character() drops unused factor levels; iterating over table() names
  # would visit filtered-out samples and append phantom rows to df.
  calls_by_iid <- split(seq_len(nrow(cnv_bed)), as.character(cnv_bed$IID))
  totals <- vapply(calls_by_iid, function(idx) {
    if (count_type == "n") {
      as.numeric(length(idx))
    } else {
      as.numeric(sum(cnv_bed$length[idx]) / unit)
    }
  }, numeric(1))

  # write totals into the matching rows; never create new rows
  rows <- match(names(totals), rownames(df))
  found <- !is.na(rows)
  df[[out_col]][rows[found]] <- as.numeric(totals[found])

  return(df)
}

AssocTest <- function(metrics.df, 
                      predictor,
                      group="ALL",
                      groupname_recode=NA,
                      lm_only=FALSE,
                      covariates=NULL,
                      outcome="CASE",
                      out_df=NULL) {
  # Test association between one per-sample metric and a binary outcome and
  # append the result as one row to an accumulating results table.
  #
  # Two models are fit:
  #   linear:   predictor ~ covariates + outcome
  #   logistic: outcome ~ covariates + predictor   (skipped if lm_only)
  #
  # Args:
  #   metrics.df: per-sample data frame holding the predictor, outcome (coded
  #       0/1), covariate columns and a `group` column.
  #   predictor: name of the metric column to test.
  #   group: value of metrics.df$group to restrict to; "ALL" keeps every row.
  #   groupname_recode: if not NA, label written to the `group` output column
  #       in place of `group`.
  #   lm_only: if TRUE, only the linear model is reported (predictor, lm_est,
  #       lm_p).
  #   covariates: character vector of covariate column names, or NULL.
  #   outcome: name of the 0/1 outcome column.
  #   out_df: results accumulated so far, or NULL.
  #
  # Returns:
  #   out_df with one row appended (or just the new row if out_df is NULL).
  #   If there are no samples, no cases or no controls to test, out_df is
  #   returned unchanged.

  # number of samples whose outcome equals `value` (NA never counts)
  CountOutcome <- function(d, value) {
    sum(d[[outcome]] == value, na.rm=TRUE)
  }

  # don't do if no samples; hand back the accumulated results, not the
  # metrics table, so callers that reassign their results keep them intact
  if (nrow(metrics.df) == 0 ||
      CountOutcome(metrics.df, 1) == 0 ||
      CountOutcome(metrics.df, 0) == 0) {
    return(out_df)
  }

  # if group is not all, subset on it
  metrics.df$group <- as.character(metrics.df$group)
  if (group != "ALL") {
    print(dim(metrics.df))
    metrics <- metrics.df[metrics.df$group == group, , drop=FALSE]
    print(dim(metrics))
  } else {
    metrics <- metrics.df
  }

  # recode group name if user has one to replace it with
  if (!is.na(groupname_recode)) {
    group <- groupname_recode
  }
 
  # get nca, nco
  nca <- CountOutcome(metrics, 1)
  nco <- CountOutcome(metrics, 0)

  # the group subset may have removed every case or every control; the
  # outcome term would then be dropped from the model, so skip the test
  if (nca == 0 || nco == 0) {
    return(out_df)
  }

  # get n, effective n (neff = 4 / (1/nca + 1/nco))
  n <- nca+nco
  neff_ca <- 2 / (1/nca + 1/nco)
  neff_co <- neff_ca
  neff <- neff_ca + neff_co

  # form and test linear model (predictor ~ covariates + PHE)
  if (!is.null(covariates)) {
    predictors <- paste0(paste(covariates, collapse="+"), "+", outcome)
  } else {
    predictors <- outcome
  }
  lm_str_i <- paste0(predictor,"~",predictors)
  lm_i <- lm(as.formula(lm_str_i), data=metrics)
  lm_i_summary <- summary(lm_i)
  lm_i_coef <- lm_i_summary$coefficients
  lm_i_ci <- confint(lm_i)

  if (lm_only == TRUE) {
    out_df_i <- data.frame(predictor=predictor,
                           lm_est=lm_i_coef[outcome,1],
                           lm_p=lm_i_coef[outcome,4])
 
  } else {
    # form mean control rate estimate
    mean_control_rate <- mean(subset(metrics,metrics[[outcome]]==0)[[predictor]])

    # form and test logistic regression model (CASE ~ covariates + predictor)
    if (!is.null(covariates)) {
      predictors <- paste0(paste(covariates, collapse="+"), "+", predictor)
    } else {
      predictors <- predictor
    }
    lg_str_i <- paste0(outcome,"~",predictors)
    lg_i <- glm(as.formula(lg_str_i), data=metrics, family=binomial)
    lg_i_summary <- summary(lg_i)
    lg_i_coef <- lg_i_summary$coefficients
    lg_i_ci <- confint(lg_i)

    # form out_df and return; logistic estimates are exponentiated, i.e.
    # reported as odds ratios per unit of the predictor
    out_df_i <- data.frame(predictor=predictor,
                           group=group,
                           control_rate=mean_control_rate,
                           n_cases=nca,
                           n_controls=nco,
                           n_samples=n,
                           n_samples_eff=neff,
                           lm_est=lm_i_coef[outcome,1],
                           lm_est_95ci_l=lm_i_ci[outcome,1],
                           lm_est_95ci_u=lm_i_ci[outcome,2],
                           lm_p=lm_i_coef[outcome,4],
                           lg_est=exp(lg_i_coef[predictor, 1]),
                           lg_est_95ci_l=exp(lg_i_ci[predictor, 1]),
                           lg_est_95ci_u=exp(lg_i_ci[predictor, 2]),
                           lg_p=lg_i_coef[predictor, 4]
                          )
  }

  if (is.null(out_df)) {
    return(out_df_i)
  } else {
    out_df <- rbind(out_df, out_df_i)
    return(out_df)
  }
   
}

# Exact Cochran-Mantel-Haenszel test of carrier status against case status,
# stratified by an analysis-group column.
#
# For every value of group_col that contains at least one case and one
# control, a 2x2 table is built: rows are carriers (predictor > 0) and
# non-carriers (predictor == 0), columns are cases (outcome == 1) and
# controls (outcome == 0). The stack of tables goes to mantelhaen.test().
#
# Args:
#   metrics:          data frame with predictor, outcome and group_col columns.
#   predictor:        name of the count column defining carrier status.
#   group_col:        name of the column that defines the strata.
#   group:            label written to the `group` output column (it does not
#                     subset the data).
#   groupname_recode: optional label that replaces `group` in the output.
#   outcome:          name of the outcome column, coded 1 (case) / 0 (control).
#   out_df:           data frame of earlier results to append to, or NULL.
#
# Returns:
#   out_df with one row appended (or just that row if out_df is NULL). The
#   perc_* columns are proportions (carriers / all samples in the used
#   strata), and the *_strat columns list the per-stratum counts separated by
#   commas. If the input has no rows, no cases or no controls, `metrics`
#   itself is returned (legacy behaviour, see note below).
#
# Raises:
#   An error if fewer than two strata contain both cases and controls.
CMHTest <- function(metrics, 
                    predictor,
                    group_col="GROUP",
                    group="ALL",
                    groupname_recode=NA,
                    outcome="CASE",
                    out_df=NULL) {

  # don't do if no samples, no cases or no controls
  # NOTE(review): these legacy guards hand back the input frame rather than
  # out_df; left unchanged because callers are not visible from here --
  # confirm whether they should return out_df instead
  if (nrow(metrics) == 0 ||
      sum(metrics[[outcome]] == 1, na.rm=TRUE) == 0 ||
      sum(metrics[[outcome]] == 0, na.rm=TRUE) == 0) {
    return(metrics)
  }

  # recode group name if user has one to replace it with
  if (!is.na(groupname_recode)) {
    group <- groupname_recode
  }

  # define analysis groups (strata) for contingency tables
  analysis_groups <- unique(sort(metrics[[group_col]]))

  # for each analysis group with both cases and controls, count cases and
  # controls with at least one variant call and with zero calls
  strata_counts <- list()
  for (analysis_group in analysis_groups) {
    metrics.g <- subset(metrics, metrics[[group_col]] == analysis_group)
    is_case <- metrics.g[[outcome]] == 1
    is_control <- metrics.g[[outcome]] == 0
    has_call <- metrics.g[[predictor]] > 0
    no_call <- metrics.g[[predictor]] == 0

    nca.g <- sum(is_case, na.rm=TRUE)
    nco.g <- sum(is_control, na.rm=TRUE)
    if (nca.g > 0 && nco.g > 0) {
      strata_counts[[length(strata_counts) + 1]] <- c(
        nca=nca.g,
        nco=nco.g,
        nca_1=sum(is_case & has_call, na.rm=TRUE),
        nca_0=sum(is_case & no_call, na.rm=TRUE),
        nco_1=sum(is_control & has_call, na.rm=TRUE),
        nco_0=sum(is_control & no_call, na.rm=TRUE)
      )
    }
  }

  # mantelhaen.test() needs a 2 x 2 x K table with K >= 2; fail with a clear
  # message rather than an opaque array-dimension error
  k <- length(strata_counts)
  if (k < 2) {
    stop("CMHTest: need at least two strata in '", group_col,
         "' with both cases and controls (found ", k, ")", call.=FALSE)
  }

  # one column per stratum, one named row per count
  counts <- do.call(cbind, strata_counts)
  nca <- counts["nca", ]
  nco <- counts["nco", ]
  nca_1 <- counts["nca_1", ]
  nca_0 <- counts["nca_0", ]
  nco_1 <- counts["nco_1", ]
  nco_0 <- counts["nco_0", ]

  # each stratum fills its 2x2 table column-wise:
  # (cases with call, cases without, controls with call, controls without)
  conting.tbls.3d <- array(counts[c("nca_1", "nca_0", "nco_1", "nco_0"), ],
                           dim=c(2,2,k))

  # perform CMH test on stack of contingency tables
  res <- mantelhaen.test(conting.tbls.3d, exact=TRUE)

  # form output df
  out_df_i <- data.frame(predictor=predictor,
                         group=group,
                         cmh_or_est=res$estimate,
                         cmh_or_ci95_l=res$conf.int[1],
                         cmh_or_ci95_u=res$conf.int[2],
                         cmh_p=res$p.value,
                         perc_cases_1=sum(nca_1)/sum(nca),
                         perc_controls_1=sum(nco_1)/sum(nco), 
                         n_cases_1=sum(nca_1),
                         n_cases_0=sum(nca_0),
                         n_controls_1=sum(nco_1),
                         n_controls_0=sum(nco_0),
                         n_cases_1_strat=paste(nca_1,collapse=","),
                         n_cases_0_strat=paste(nca_0,collapse=","),
                         n_controls_1_strat=paste(nco_1,collapse=","),
                         n_controls_0_strat=paste(nco_0,collapse=",")
                        )

  # append to earlier results if the caller supplied any
  if (is.null(out_df)) {
    return(out_df_i)
  }
  rbind(out_df, out_df_i)
}


# run the analysis only when R is not being used interactively
if (!interactive()) {
  Main()
}
