

library(readxl)
library(DescTools)
library(dplyr)

## DATA
GNOMAD_LOF_METRICS_TXT <- "results/global_cnv_burden_analysis/gnomad.v2.1.1.lof_metrics.by_gene.txt"
HALVORSEN_2021_SUPP_TABLES_XLSX <- "data/wes_meta/41593_2021_876_MOESM3_ESM.xlsx"
IN_CNV_IID_GROUP_PHENO_TXT <- "results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv"
IN_CNV_CDS_BED <- "results/assoc_tests/NORDiC_2021.callset.analysisready.CDS_gene_overlaps.bed"
IN_CDS_LOCI_BED <- "results/wes_meta/Homo_sapiens.GRCh37.87.CDS.minmax_per_gene.bed"
OUTROOT <- "results/wes_meta/NORDiC_2022.OCD_rarevar_meta-analysis.771trios_2724ca_5369co"

## PARAM
NITER <- 20000
EXTTADA_SCRIPT_DIR <- "../src/extTADA_20220225/script/"
SEED_NUMBER <- 781482

## EXTTADA
fileR <- dir(EXTTADA_SCRIPT_DIR, ".R$")
for (ii in fileR) {
  source(paste0(EXTTADA_SCRIPT_DIR, ii))
}

Main <- function(){

  # for extTADA, make rstan run faster
  # set_cppo(mode="fast")

  # if seed value is set (and not NA) then set seed for analysis
  if (is.na(SEED_NUMBER) == F) {
    set.seed(SEED_NUMBER)
  }

  # incorporate multi-core processing
  options(mc.cores = parallel::detectCores())

  # read gnomad lof metrics, get lof-intol genes
  gnomad <-read.table(GNOMAD_LOF_METRICS_TXT,
                      header=T, sep="\t", stringsAsFactors=F)
  gnomad <- subset(gnomad, is.na(pLI)==F)

  # read Table S15 from Halvorsen 2021 and remove columns that are unneeded
  tbl <- data.frame(read_excel(HALVORSEN_2021_SUPP_TABLES_XLSX,
                               sheet='Table S15',
                               skip=1))
  cols_rm <- c("dn_lof_poisson_p","dn_lofmisD_poisson_p",
               "extTADA_BF","extTADA_PP","extTADA_qvalue")
  for (col in cols_rm) {
    tbl[[col]] <- NULL
  }

  # add row to table with obs / exp damaging (lof + misD) DNMs
  tbl$mut_dmg <- tbl$mut_lof + tbl$mut_misD
  tbl$dn_dmg <- tbl$dn_lof + tbl$dn_misD

  # read input cnv caco iid/group/pheno file
  iid_grp_phe <- read.table(IN_CNV_IID_GROUP_PHENO_TXT,
                            stringsAsFactors=F)
  grps <- unique(sort(iid_grp_phe[,2]))
  colnames(iid_grp_phe) <- c("IID","GROUP","CASE")
  case_iids <- subset(iid_grp_phe, CASE==1)$IID
  ctrl_iids <- subset(iid_grp_phe, CASE==0)$IID
  cnv_n_ca <- length(case_iids)
  cnv_n_co <- length(ctrl_iids)
  iids_list <- list()
  nca_list <- list()
  nco_list <- list()
  for (grp in grps) {
    iid_grp_phe.g <- subset(iid_grp_phe, GROUP==grp)
    iids_list[[grp]] <- iid_grp_phe.g$IID
    nca_list[[grp]] <- nrow(subset(iid_grp_phe,
                                   (CASE==1) & (GROUP==grp)))
    nco_list[[grp]] <- nrow(subset(iid_grp_phe,
                                   (CASE==0) & (GROUP==grp)))

  }

  # build caco sumstats from cnv
  cnv <- read.table(IN_CNV_CDS_BED, stringsAsFactors=F, header=F)
  colnames(cnv) <- c("chrom","start0","end","locus","cnvtype",
                     "IID","gene")

  # get all cnvs that hit CDS of only 1 gene 
  interval_geneoverlaps <- unique(cnv[, c("locus","gene")]) 
  locus_ngenes <- table(interval_geneoverlaps[,1]) 
  loci_g1 <- names(locus_ngenes[locus_ngenes == 1 ])
  cnv_g1 <- subset(cnv, locus %in% loci_g1)
  
  # get genes hit by these CNVs only in cases
  cnv_g1_ca <- subset(cnv_g1, IID %in% case_iids)
  cnv_g1_ca_genes <- unique(sort(cnv_g1_ca$gene))
  cnv_g1_co <- subset(cnv_g1, IID %in% ctrl_iids)
  cnv_g1_co_genes <- unique(sort(cnv_g1_co$gene))
  cnv_g1_genes_ca_only <- setdiff(cnv_g1_ca_genes, cnv_g1_co_genes)
  cnv_g1_genes_co_only <- setdiff(cnv_g1_co_genes, cnv_g1_ca_genes)

  # print number of genes hit by single-gene cnvs in cases only to stdout
  cat("Number of genes hit by single-gene CNVs in cases only:",
      length(cnv_g1_genes_ca_only), "\n")
  cat("Number of genes hit by single-gene CNVs in controls only:",
      length(cnv_g1_genes_co_only), "\n")

  # init df for storing results of overlap between cnv and exome results
  overlap_df <- data.frame(genes=character(), category=character(),
                           obs_count=numeric(), exp_count=numeric(),
                           obs_rate=numeric(), exp_rate=numeric(),
                           obs_rate_95ci_l=numeric(),
                           rate_ratio=numeric(),
                           poisson_onesided_p=numeric())

  # for genes hit with single-gene CNVs only in cases and not in controls,
  # use Table S15 of Halvorsen 2021 OCD exome paper to compare observed vs
  # expected rate of damaging (LoF + misD) DNMs in 771 trios, and rate of 
  # LoF variants in 476 seperate cases versus 1761 controls
  wes_n_trios <- 771
  wes_n_ca <- 476
  wes_n_co <- 1761
  tbl_cnv_g1ca_genes <- subset(tbl, gene %in% cnv_g1_genes_ca_only)
  cnv_g1ca_genes_dmg_exp <- sum(tbl_cnv_g1ca_genes$mut_dmg) * 2 * wes_n_trios
  cnv_g1ca_genes_dmg_obs <- sum(tbl_cnv_g1ca_genes$dn_dmg)
  dmg_rateratio <- cnv_g1ca_genes_dmg_obs / cnv_g1ca_genes_dmg_exp
  o_e_dmg_res <- poisson.test(cnv_g1ca_genes_dmg_obs, wes_n_trios,
                              r=cnv_g1ca_genes_dmg_exp / wes_n_trios,
                              alternative='greater')
  caco_lof_ca <- sum(tbl_cnv_g1ca_genes$cc_ocdjhu2020caucasian_lof_ca)
  caco_lof_co <- sum(tbl_cnv_g1ca_genes$cc_ocdjhu2020caucasian_lof_co)
  o_e_caco_res <- poisson.test(caco_lof_ca, wes_n_ca,
                               r=caco_lof_co / wes_n_co,
                               alternative='greater')
  caco_lof_ca_rate <- caco_lof_ca / wes_n_ca
  caco_lof_co_rate <- caco_lof_co / wes_n_co
  lof_rateratio <- caco_lof_ca_rate / caco_lof_co_rate

  # add results to df
  overlap_df <- rbind(overlap_df,
                      data.frame(genes=rep("hit by single-gene CNVs (case-only)",2), 
                                 category=c("LoF/misD DNMs","LoF case/control"),
                                 obs_count=c(cnv_g1ca_genes_dmg_obs,
                                             caco_lof_ca),
                                 exp_count=c(cnv_g1ca_genes_dmg_exp,
                                             caco_lof_co/wes_n_co*wes_n_ca),
                                 obs_rate=c(cnv_g1ca_genes_dmg_obs/wes_n_trios,
                                            caco_lof_ca_rate), 
                                 exp_rate=c(cnv_g1ca_genes_dmg_exp/wes_n_trios,
                                            caco_lof_co_rate),
                                 obs_rate_95ci_l=c(o_e_dmg_res$conf.int[1],
                                                   o_e_caco_res$conf.int[1]),
                                 rate_ratio=c(dmg_rateratio,
                                              lof_rateratio),
                                 poisson_onesided_p=c(o_e_dmg_res$p.value,
                                                      o_e_caco_res$p.value)
                                )
                      )

  # repeat with genes only hit with CNVs in controls (shouldn't defy null)
  tbl_cnv_g1co_genes <- subset(tbl, gene %in% cnv_g1_genes_co_only)
  cnv_g1co_genes_dmg_exp <- sum(tbl_cnv_g1co_genes$mut_dmg) * 2 * wes_n_trios
  cnv_g1co_genes_dmg_obs <- sum(tbl_cnv_g1co_genes$dn_dmg)
  dmg_rateratio <- cnv_g1co_genes_dmg_obs / cnv_g1co_genes_dmg_exp
  o_e_dmg_res <- poisson.test(cnv_g1co_genes_dmg_obs, wes_n_trios,
                              r=cnv_g1co_genes_dmg_exp / wes_n_trios,
                              alternative='greater')
  caco_lof_ca <- sum(tbl_cnv_g1co_genes$cc_ocdjhu2020caucasian_lof_ca)
  caco_lof_co <- sum(tbl_cnv_g1co_genes$cc_ocdjhu2020caucasian_lof_co)
  o_e_caco_res <- poisson.test(caco_lof_ca, wes_n_ca,
                               r=caco_lof_co / wes_n_co,
                               alternative='greater')
  caco_lof_ca_rate <- caco_lof_ca / wes_n_ca
  caco_lof_co_rate <- caco_lof_co / wes_n_co
  lof_rateratio <- caco_lof_ca_rate / caco_lof_co_rate

  # add results to df
  overlap_df <- rbind(overlap_df,
                      data.frame(genes=rep("hit by single-gene CNVs (control-only)",2), 
                                 category=c("LoF/misD DNMs","LoF case/control"),
                                 obs_count=c(cnv_g1co_genes_dmg_obs,
                                             caco_lof_ca),
                                 exp_count=c(cnv_g1co_genes_dmg_exp,
                                             caco_lof_co/wes_n_co*wes_n_ca),
                                 obs_rate=c(cnv_g1co_genes_dmg_obs/wes_n_trios,
                                            caco_lof_ca_rate), 
                                 exp_rate=c(cnv_g1co_genes_dmg_exp/wes_n_trios,
                                            caco_lof_co_rate),
                                 obs_rate_95ci_l=c(o_e_dmg_res$conf.int[1],
                                                   o_e_caco_res$conf.int[1]),
                                 rate_ratio=c(dmg_rateratio,
                                              lof_rateratio),
                                 poisson_onesided_p=c(o_e_dmg_res$p.value,
                                                      o_e_caco_res$p.value)
                                )
                      )


  print(overlap_df)
  
  # write overlap results to file
  write.csv(overlap_df,
            file=paste0(OUTROOT, ".cnv_g1_caseonly.ocd_wes_overlap.csv"),
            row.names=F, quote=F)

  # read cds loci, merge into sumstats df
  cds <- read.table(IN_CDS_LOCI_BED, stringsAsFactors=F, header=F)
  colnames(cds) <- c("CHR","START","END","gene")
  cds$START <- cds$START + 1
  cds <- cds[,c("gene","CHR","START","END")]
  tbl <- merge(cds, tbl, by='gene')

  # add cnv length to cnv df
  cnv$length <- cnv$end - cnv$start0

  # del only, dup only
  del <- subset(cnv, cnvtype=="DEL")
  dup <- subset(cnv, cnvtype=="DUP")
  cnv_list <- list("del"=del,"dup"=dup,"cnv"=cnv)

  # get case, control carrier counts and store in count df
  cnv_caco_df <- data.frame(gene=cds$gene, 
                            cc_nordic_del_ca=rep(0, length(cds$gene)),
                            cc_nordic_del_co=rep(0, length(cds$gene)),
                            cc_nordic_dup_ca=rep(0, length(cds$gene)),
                            cc_nordic_dup_co=rep(0, length(cds$gene))
                           )
  rownames(cnv_caco_df) <- cnv_caco_df$gene
  for (x in c("del","dup")) {
    cnv_ca <- subset(cnv_list[[x]], IID %in% case_iids)
    cnv_co <- subset(cnv_list[[x]], IID %in% ctrl_iids)

    # only use cnvs hitting 1 gene
    cnv_ca <- subset(cnv_ca, locus %in% loci_g1)
    cnv_co <- subset(cnv_co, locus %in% loci_g1)

    cnv_ca_g_n <- table(cnv_ca$gene)
    for (gene in names(cnv_ca_g_n)) { 
      cnv_caco_df[gene, paste0("cc_nordic_",x,"_ca")] <- cnv_ca_g_n[[gene]]
    }
    cnv_co_g_n <- table(cnv_co$gene)
    for (gene in names(cnv_co_g_n)) { 
      cnv_caco_df[gene, paste0("cc_nordic_",x,"_co")] <- cnv_co_g_n[[gene]]
    }
  }
  rownames(cnv_caco_df) <- NULL

  # merge NORDiC gene-based cnv case/control counts into exome df
  tbl <- merge(tbl, cnv_caco_df, by='gene')
  rownames(tbl) <- tbl$gene

  # write TADA input table to file
  write.table(tbl,
              file=paste0(OUTROOT, ".extTADA_input.tsv"),
              row.names=F, col.names=T,
              sep="\t", quote=F)
  
  # define ca/co sample sizes
  ncas <- c(wes_n_ca, cnv_n_ca, cnv_n_ca)
  ncon <- c(wes_n_co, cnv_n_co, cnv_n_co)

  # format sample sizes in a way that extTADA will recognize
  n_exttada = list(dn = c(771, 771), ca=ncas, cn=ncon)

  # get denovo obs and exp mut data
  allDNData <- tbl[, paste0("dn_", c("misD", "lof"))]
  allMutData <- tbl[,paste0("mut_", c("misD", "lof"))]

  # collect ca/co count data
  caco_datasets <- c("ocdjhu2020caucasian_lof","nordic_del","nordic_dup")
  dataCCcase <- tbl[,paste0("cc_",caco_datasets,"_ca"),drop=F]
  dataCCcontrol <- tbl[,paste0("cc_",caco_datasets,"_co"),drop=F]

  # derive parameters from the full data using extTADA procedure
  mcmcDD <- extTADAmcmc(modelName = DNandCCextTADA, 
                        dataDN = allDNData, mutRate = allMutData,
                        Ndn = n_exttada$dn, ## There are two de novo categories
              		      dataCCcase = dataCCcase, dataCCcontrol = dataCCcontrol,
	  	        Ncase = n_exttada$ca, Ncontrol = n_exttada$cn,
                        nCore = 3, nChain = 3,
                        nIteration = NITER)

  print(mcmcDD)

  options(repr.plot.width=5, repr.plot.height=5)
  stan_trace(mcmcDD)

  gamma.cc<-c('hyperGammaMeanCC[1]','hyperGammaMeanCC[2]','hyperGammaMeanCC[3]')
  beta.cc<-c('hyperBetaCC[1]','hyperBetaCC[2]','hyperBetaCC[3]')
  pars0 <- estimatePars(pars = c('pi0',
                                 'hyperGammaMeanDN[1]', 'hyperGammaMeanDN[2]',
                                 'hyperBetaDN[1]', 'hyperBetaDN[2]',
                                 gamma.cc, beta.cc),
                        mcmcResult = mcmcDD)

  print(pars0)

  options(repr.plot.width=4, repr.plot.height=3)
  par(mfrow = c(1, 2))
  plotParHeatmap(pars = c("pi0", "hyperGammaMeanDN[1]"), mcmcResult = mcmcDD)
  plotParHeatmap(pars = c("pi0", "hyperGammaMeanDN[2]"), mcmcResult = mcmcDD)


  ##Get gene list
  geneName <- tbl$gene
  ##Set parameters: use pars0 above
  parsFDR <- list(gammaMeanDN = pars0[, 1][c('hyperGammaMeanDN[1]', 'hyperGammaMeanDN[2]')],
                  betaDN = pars0[, 1][c('hyperBetaDN[1]', 'hyperBetaDN[2]')],
                  gammaMeanCC = pars0[,1][gamma.cc],
                  betaCC = pars0[, 1][beta.cc],
                  pi0 = pars0[, 1][1],
                  nfamily = n_exttada$dn,
                  ncase = n_exttada$ca,
                  ncontrol = n_exttada$cn)
  print(parsFDR)
  dataFDR <- calculateFDR(pars = parsFDR,
                          dnData = allDNData, mutData = allMutData,
                          caseData=dataCCcase, controlData=dataCCcontrol,
                          geneName = geneName)

  ## order by increasing q-value
  dataFDR <- dataFDR[order(dataFDR$qvalue), ]

  head(dataFDR)

  dim(dataFDR[dataFDR$qvalue < 0.1, ])
  dim(dataFDR[dataFDR$qvalue < 0.05, ])

  ## write table to file
  write.table(dataFDR, file=paste0(OUTROOT, ".gene_res.tsv"),
              row.names=F, col.names=T,
              sep="\t", quote=F)

  ## save all RData from analysis to file
  save(allDNData, allMutData, dataCCcase, dataCCcontrol,
       mcmcDD, pars0, parsFDR, dataFDR, n_exttada, 
       file=paste0(OUTROOT,".RData"))

  q()
  
  # format df for ManhattanPlot function
  # out.df$P <- out.df$cmh_p
  # ntests_tot <- nrow(out.df)

  # make manhattan plot
  # ManhattanPlot(out.df,
  #               paste0(OUTROOT,".res.manhattan.pdf"),
  #               ntests_tot)
  
}

ManhattanPlot <- function(df, out_pdf, 
                          ntests_tot, 
                          plot_title=NULL) {
  df$MIDPT <- (df$START + df$END) / 2
  df <- df[ order(df$CHR, df$MIDPT), ]
  df$MIDPT.RANK <- rep(0, nrow(df))
  x<-1
  df$CHR <- as.numeric(df$CHR)
  chroms <- c()
  chroms.ticks <- c()
  for (chrom.i in seq(1,22,by=1)) {
    df.c <- subset(df, CHR==chrom.i)
    df.c <- df.c[ order(df.c$MIDPT), ]
    cids <- rownames(df.c)
    df[ cids, "MIDPT.RANK" ] <- seq(x,
                                    (x+nrow(df.c)-1),
                                    by=1)
    df.c[ cids, "MIDPT.RANK" ] <- seq(x,
                                      (x+nrow(df.c)-1),
                                      by=1)
    midpt.mid <- (min(df.c$MIDPT.RANK) + max(df.c$MIDPT.RANK)) / 2

    chroms <- c(chroms, x)
    chroms.ticks <- c(chroms.ticks, midpt.mid)
    x <- x + nrow(df.c) +1
  }
  chroms <- c(chroms, x)

  df$LOG10P <- -log10( df$P )

  if (is.null(out_pdf) == F) {
    pdf(out_pdf, 10, 5)
  }

  df <- df[order(df$MIDPT.RANK), ]

  add.to.plot <- F
  for (chrom in seq(1,22,by=1)) {
    if (chrom %% 2 == 1) {
      colchrom = rgb(0.0,0.3,0.2,1)
    } else {
      colchrom = rgb(0.0,0.0,0.8,1)
    }
   
    df.c <- df[ df$CHR == chrom, ]
    if (chrom == 1) {
      plot(df.c$MIDPT.RANK, df.c$LOG10P, type="p",
           xlim=c(1,max(df$MIDPT.RANK)), xaxt='n',
           ylim=c(0,max(c(df$LOG10P, -log10( 0.05/ntests_tot )+1))),
           col=colchrom,
           xlab="", ylab="-log10(P)", main=plot_title, pch=20)
    
    } else {
      points(df.c$MIDPT.RANK, df.c$LOG10P, col=colchrom, pch=20)
    }
  }

  axis(1, at=chroms.ticks, labels=seq(1,22,by=1), cex.axis=0.5, las=0)

  abline(h= -log10( 0.05/ntests_tot ), col="firebrick4")

  df.p <- df[df$P < (0.05/ntests_tot), , drop=F]
  df.p <- subset(df.p, is.na(gene)==F)
  if (nrow(df.p) > 0) {
    textplot(df.p$MIDPT.RANK, df.p$LOG10P, df.p$gene, 
             pos=2, cex=0.5, new=F, show.lines=F)
  }
  if(is.null(out_pdf)==F) {
    dev.off()
  }

}

CacoFET <- function(calls, geneset.x, geneset.x1, 
                    cnvtype, genesetname,
                    case_iids, ctrl_iids, 
                    geneset0='baseline', res_df=NULL) {
  # before proceeding make sure that geneset x1 is in fact intersected with x
  geneset.x1 <- intersect(geneset.x1, geneset.x)

  calls.x <- subset(calls, gene %in% geneset.x)
  calls.x1 <- subset(calls.x, gene %in% geneset.x1)
  calls.x.ca <- subset(calls.x, IID %in% case_iids)
  calls.x.co <- subset(calls.x, IID %in% ctrl_iids)
  calls.x1.ca <- subset(calls.x1, IID %in% case_iids)
  calls.x1.co <- subset(calls.x1, IID %in% ctrl_iids)
  nca <- length(unique(calls.x.ca$IID))
  nco <- length(unique(calls.x.co$IID))
  nca.1 <- length(unique(calls.x1.ca$IID))
  nco.1 <- length(unique(calls.x1.co$IID))
  tbl <- data.frame(ca=c(nca.1, nca-nca.1),co=c(nco.1,nco-nco.1))
  res <- fisher.test(tbl, alternative='greater')
  res_df_x <- data.frame(cnvtype=cnvtype,
                         geneset0=geneset0,
                         geneset=genesetname,
                         nca_1=nca.1,
                         nca_0=nca-nca.1,
                         nco_1=nco.1,
                         nco_0=nco-nco.1,
                         or_est=res$estimate,
                         or_ci95_l=res$conf.int[1],
                         or_ci95_u=res$conf.int[2],
                         p_value=res$p.value)
  if (is.null(res_df)) {
    return(res_df_x)
  } else {
    res_df <- rbind(res_df, res_df_x)
    return(res_df)
  }
}

if (interactive() == F) {
  Main()
}

