
## MODULE
library(xlsx)

## DATA
# Input files under results/ (paths are relative to the working directory).
# fam file: Main() reads columns 2, 5 and 6 as IID, FEMALE and CASE.
FAM_FILE <- file.path(
  "results", "cnv_filtering",
  "NORDiC_2021.analysisready.fam"
)
# per-sample metrics table; only the IID and dataset columns are used.
METRICS_TSV <- file.path(
  "results", "intensity_qc",
  "NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.tsv"
)
# CNV callset, read as chrom / start0 / end / locus / cnvtype / IID.
CNV_BED <- file.path(
  "results", "assoc_tests",
  "NORDiC_2021.callset.analysisready.bed"
)
# two-column table mapping a CNV locus to an overlapped gene.
INTERVAL_GENEOVERLAPS_TSV <- file.path(
  "results", "annot_cnv",
  "NORDiC_2021.callset.interval_geneoverlaps.tsv"
)
# gnomAD per-gene LoF metrics; the gene and pLI columns drive the pLI filter.
GNOMAD_LOF_METRICS_TSV <- file.path(
  "results", "global_cnv_burden_analysis",
  "gnomad.v2.1.1.lof_metrics.by_gene.txt"
)
# neurodevelopmental deletion / duplication calls.
NDEV_DEL_BED <- file.path(
  "results", "annot_cnv",
  "NORDiC_2021.qual_cnv.del_neurodev.bed"
)
NDEV_DUP_BED <- file.path(
  "results", "annot_cnv",
  "NORDiC_2021.qual_cnv.dup_neurodev.bed"
)

# Input files under data/.
# gene lists, read with scan() as whitespace-separated symbols.
NDEV_EXONICDEL_GENELIST <- file.path(
  "data", "annot_cnv",
  "PMID_30994872_TableS11.exonic_del.gene.list"
)
NDD_GENES <- file.path(
  "data", "annot_cnv",
  "Fu_2021_medrxiv.NDD_genes_FDR05.txt"
)
# sample tables / sample sheets used to translate sample identifiers.
NORDICSWE_2020_SAMPLETABLE_TSV <- file.path(
  "data", "cnv_carrier_table",
  "NORDiCSWE_cases_20201201.raw.tsv"
)
NORDICSWE_2018_TXT <- file.path(
  "data", "cnv_carrier_table",
  "samplesheet_2018-198-ILL_GSAKIS_N=1108.csv"
)
NORDICSWE_2018_SAMPLETABLE_CSV <- file.path(
  "data", "cnv_carrier_table",
  "LB_Samplesheet_signed_ER_deliveryreport.NORDiC-SWE_2018_sample_table.csv"
)
NORDICNOR_2019_1_TXT <- file.path(
  "data", "cnv_carrier_table",
  "samplesheet_2018-065-ILL_GSAUHB_N=243.csv"
)
NORDICNOR_2019_2_TXT <- file.path(
  "data", "cnv_carrier_table",
  "samplesheet_2018-227-ILL_GSAUHB_N=239.csv"
)
NORDICNOR_2020_TXT <- file.path(
  "data", "cnv_carrier_table",
  "samplesheet_2020-202-ILL_GSAUHB_N=371.csv"
)
# PLINK .profile file; its SCORE column becomes OCD_PRS.
OCD_PRS_TXT <- file.path(
  "data", "cnv_carrier_table",
  "ocd_ndnorswe.PGC_OCD_020522_NORDiC0.20220516.PLINK.profile"
)

## PARAM
# Placeholder written into every column listed in UNDEF_COLS.
UNDEF_VAL <- NA
# Columns that Main() appends to the carrier table, all filled with UNDEF_VAL.
UNDEF_COLS <- c(
  "Age_at_diagnosis", "Age_at_symptom_debut",
  "YBOCS_pre", "YBOCS_post",
  "GAF", "CGAS", "WSAS",
  "ASD", "ID", "ADHD", "SCZ", "BIP",
  "TS_or_chronic_tics"
)

# ---- helpers used by Main() ------------------------------------------------

# Read a header-less, tab-separated CNV interval table and name its leading
# six fields. Tables with more than six columns keep their extra columns,
# which callers address by position.
read_cnv_bed <- function(path) {
  bed <- read.table(path, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
  colnames(bed) <- c("chrom", "start0", "end", "locus", "cnvtype", "IID")
  bed
}

# Read a ';'-separated sample sheet whose header row comes after 8 leading
# lines that are skipped.
read_samplesheet <- function(path) {
  read.table(path, skip = 8, header = TRUE,
             stringsAsFactors = FALSE, sep = ";")
}

# Write a data frame as an unquoted, tab-separated file without row names.
write_tab_table <- function(tbl, path, header = TRUE) {
  write.table(tbl, file = path,
              row.names = FALSE, col.names = header,
              quote = FALSE, sep = "\t")
}

# IIDs of all CNVs of the given type ("DEL" / "DUP") whose locus is in `loci`.
# which() drops NA comparisons, matching what subset() does.
cnv_carrier_iids <- function(cnv, type, loci) {
  cnv$IID[which(cnv$cnvtype == type & cnv$locus %in% loci)]
}

# 1/0 indicator per sample: is the sample's IID among `carrier_iids`?
flag_carriers <- function(iids, carrier_iids) {
  ifelse(iids %in% carrier_iids, 1, 0)
}

# Write one cohort table to a single-sheet Excel workbook.
write_carrier_xlsx <- function(tbl, path, sheet_name) {
  write.xlsx(tbl,
             file = path,
             sheetName = sheet_name,
             col.names = TRUE,
             row.names = FALSE,
             append = FALSE,
             showNA = TRUE,
             password = NULL)
}

# ---- entry point -----------------------------------------------------------

# Build the per-case CNV carrier table and its cohort-specific variants.
#
# Command line: 08.cnv_carrier_table.R <outroot>
#
# Reads the input files named by the module-level constants and writes:
#   <outroot>.pLI_gt_995.bed              CNV x gene rows, gene pLI > 0.995
#   <outroot>.neurodev_cnv.bed            neurodev dels (+ exonic dels), dups
#   <outroot>.NDD_genes_FDR05.bed         CNV x gene rows hitting NDD genes
#   <outroot>.del_dup.carriers.tsv        carrier flags for all cases
#   <outroot>.del_dup.carriers.{NOR,SWE}.tsv / .xlsx   per-cohort tables
#
# Exits with status 1 (after printing the usage line) when the number of
# command-line arguments is not exactly one.
Main <- function() {

  ARGS <- commandArgs(trailingOnly = TRUE)
  if (length(ARGS) != 1) {
    cat("08.cnv_carrier_table.R <outroot>\n")
    # a usage error must not look like a successful run to the caller
    quit(save = "no", status = 1)
  }
  outroot <- ARGS[1]

  # read fam file; keep IID, sex and phenotype columns
  fam <- read.table(FAM_FILE, header = FALSE, stringsAsFactors = FALSE)
  df <- fam[, c(2, 5, 6)]
  colnames(df) <- c("IID", "FEMALE", "CASE")
  # presumably PLINK 1/2 coding; shift both columns down by one (to 0/1)
  df$FEMALE <- df$FEMALE - 1
  df$CASE <- df$CASE - 1

  # cases only
  df <- df[which(df$CASE == 1), ]

  # merge dataset classification into table
  metrics <- read.table(METRICS_TSV, header = TRUE, sep = "\t",
                        stringsAsFactors = FALSE)
  metrics <- metrics[, c("IID", "dataset")]
  colnames(metrics)[2] <- "DATASET"
  df <- merge(metrics, df, by = "IID")

  # merge OCD PRS (raw, not z-score transformed) into table
  # (inner join: cases without a PRS row are dropped)
  prs <- read.table(OCD_PRS_TXT, header = TRUE, stringsAsFactors = FALSE)
  prs <- prs[, c("IID", "SCORE")]
  colnames(prs)[2] <- "OCD_PRS"
  df <- merge(df, prs, by = "IID")

  # read cnv bed file
  cnv <- read_cnv_bed(CNV_BED)
  cnv$length <- cnv$end - cnv$start0

  # read neurodev del, dup bed files
  ndev_del <- read_cnv_bed(NDEV_DEL_BED)
  ndev_dup <- read_cnv_bed(NDEV_DUP_BED)

  # exclude the following neurodev CNV classes due to having a
  # case/control CNV count of at least 10 :
  # 1. dels : 15q11.2_del_BP1-BP2
  # 2. dups : 15q11.2_dup_BP1-BP2, 16p13.11_dup
  # NOTE(review): column 10 is assumed to hold the neurodev CNV class name
  # in both files -- confirm against the step that produces them.
  ndev_class_col <- 10
  excluded_del_classes <- c("15q11.2_del_BP1-BP2")
  excluded_dup_classes <- c("15q11.2_dup_BP1-BP2", "16p13.11_dup")
  ndev_del <- ndev_del[!(ndev_del[, ndev_class_col] %in%
                           excluded_del_classes), ]
  ndev_dup <- ndev_dup[!(ndev_dup[, ndev_class_col] %in%
                           excluded_dup_classes), ]

  # from here on only the six leading BED fields are needed
  ndev_del <- ndev_del[, 1:6]
  ndev_dup <- ndev_dup[, 1:6]

  # read gnomad pLI file, only keep genes with a non-missing pLI > 0.995
  gnomad_pli <- read.table(GNOMAD_LOF_METRICS_TSV, header = TRUE, sep = "\t",
                           stringsAsFactors = FALSE)
  gnomad_pli <- gnomad_pli[, c("gene", "pLI", "oe_lof_upper_bin")]
  pli995_genes <- gnomad_pli$gene[which(gnomad_pli$pLI > 0.995)]

  # read gene cds overlaps per cnv locus, get those overlapping gene with
  # pLI > 0.995
  interval_geneoverlaps <- read.table(INTERVAL_GENEOVERLAPS_TSV,
                                      header = FALSE,
                                      stringsAsFactors = FALSE, sep = "\t")
  colnames(interval_geneoverlaps) <- c("locus", "gene")
  pli995_loci <- interval_geneoverlaps$locus[
    interval_geneoverlaps$gene %in% pli995_genes
  ]

  # one row per (cnv, overlapped gene); reused by both gene-level outputs
  cnv_genes <- merge(cnv, interval_geneoverlaps, by = "locus")

  # write cnvs hitting genes with pli > 0.995 to file
  write_tab_table(cnv_genes[cnv_genes$gene %in% pli995_genes, ],
                  paste0(outroot, ".pLI_gt_995.bed"),
                  header = FALSE)

  # add genes with exonic del in key ndev genes to ndev del list
  ndev_exonicdel_genes <- scan(NDEV_EXONICDEL_GENELIST,
                               what = character(), quiet = TRUE)
  exonicdel_loci <- interval_geneoverlaps$locus[
    interval_geneoverlaps$gene %in% ndev_exonicdel_genes
  ]
  ndev_exonicdel <- cnv[which(cnv$cnvtype == "DEL" &
                                cnv$locus %in% exonicdel_loci), 1:6]
  ndev_del <- rbind(ndev_del, ndev_exonicdel)

  # write ndev cnvs to file
  write_tab_table(rbind(ndev_del, ndev_dup),
                  paste0(outroot, ".neurodev_cnv.bed"),
                  header = FALSE)

  # store whether or not a sample has at least 1 neurodev del, dup
  df$has_neurodev_del <- flag_carriers(df$IID, ndev_del$IID)
  print(table(df$has_neurodev_del))
  df$has_neurodev_dup <- flag_carriers(df$IID, ndev_dup$IID)
  print(table(df$has_neurodev_dup))

  # store whether or not a sample has at least 1 del / dup in pLI > 0.995 gene
  df$has_del_pli995 <- flag_carriers(
    df$IID, cnv_carrier_iids(cnv, "DEL", pli995_loci)
  )
  df$has_dup_pli995 <- flag_carriers(
    df$IID, cnv_carrier_iids(cnv, "DUP", pli995_loci)
  )

  # add whether or not a sample has a CNV overlapping at least one
  # NDD gene (Fu et al. 2021)
  ndd_genes <- scan(NDD_GENES, what = character(), quiet = TRUE)
  # remove PTGDR2 and CORO1A (10 or more CNVs in case/control data)
  ndd_genes <- ndd_genes[!(ndd_genes %in% c("PTGDR2", "CORO1A"))]
  ndd_loci <- interval_geneoverlaps$locus[
    interval_geneoverlaps$gene %in% ndd_genes
  ]
  df$has_del_nddgene <- flag_carriers(
    df$IID, cnv_carrier_iids(cnv, "DEL", ndd_loci)
  )
  df$has_dup_nddgene <- flag_carriers(
    df$IID, cnv_carrier_iids(cnv, "DUP", ndd_loci)
  )

  # write cnvs that hit ndd genes to file (with header, duplicates removed)
  write_tab_table(unique(cnv_genes[cnv_genes$gene %in% ndd_genes, ]),
                  paste0(outroot, ".NDD_genes_FDR05.bed"),
                  header = TRUE)

  # init columns where clinical info will go
  for (undef_col in UNDEF_COLS) {
    df[[undef_col]] <- rep(UNDEF_VAL, nrow(df))
  }

  # before writing to file, need to swap out iids from NORDiC-SWE 2018 batch.
  # The sheet is read once and reused below for the SWE translation table.
  swe2018_sheet <- read_samplesheet(NORDICSWE_2018_TXT)
  iid_to_sample <- swe2018_sheet
  iid_to_sample$IID <- paste0(iid_to_sample$SentrixBarcode_A, "_",
                              iid_to_sample$SentrixPosition_A)
  iid_to_sample <- iid_to_sample[, c("IID", "Sample_ID")]
  # all.y keeps every case; samples absent from the sheet get NA Sample_ID
  df <- merge(iid_to_sample, df, by = "IID", all.y = TRUE)
  colnames(df)[1] <- "Analysis_ID"
  # fall back to the analysis ID where the sheet has no Sample_ID
  df$Sample_ID <- ifelse(is.na(df$Sample_ID), df$Analysis_ID, df$Sample_ID)

  # write to file
  write_tab_table(df, paste0(outroot, ".del_dup.carriers.tsv"))

  # split data into NOR and SWE
  df.nor <- df[grepl("NORDiC-NOR", df$DATASET), ]
  df.swe <- df[grepl("NORDiC-SWE", df$DATASET), ]

  # NORDiC-SWE : build translation table (Patient <-> Sample_ID)

  # NORDiC-SWE 2020
  swe2020_ids <- read.table(NORDICSWE_2020_SAMPLETABLE_TSV,
                            sep = "\t", header = TRUE,
                            stringsAsFactors = FALSE)
  swe2020_ids <- swe2020_ids[, c("Patient", "Sample_ID")]

  # NORDiC-SWE 2018
  swe2018_names <- swe2018_sheet[, c("Sample_ID", "Sample_Name")]
  swe2018_table <- read.csv(NORDICSWE_2018_SAMPLETABLE_CSV,
                            stringsAsFactors = FALSE)
  # fix some columns where column order is shifted by 1, and Patient IDs
  # are listed as 1 for certain samples (only 7 or so)
  swe2018_table$Patient.ID <- ifelse(swe2018_table$Patient.ID == 1,
                                     swe2018_table$Diagnosis,
                                     swe2018_table$Patient.ID)
  swe2018_ids <- merge(swe2018_names,
                       swe2018_table[, c("Sample_Name", "Patient.ID")],
                       by = "Sample_Name")
  swe2018_ids <- swe2018_ids[, c("Patient.ID", "Sample_ID")]
  colnames(swe2018_ids) <- c("Patient", "Sample_ID")

  df.swe.trans <- rbind(swe2020_ids, swe2018_ids)

  # NORDiC-NOR : build translation table (Sample_Name <-> Sample_ID) from
  # the 2019 batch 1, 2019 batch 2 and 2020 sample sheets, in that order
  nor_sheets <- c(NORDICNOR_2019_1_TXT, NORDICNOR_2019_2_TXT,
                  NORDICNOR_2020_TXT)
  df.nor.trans <- do.call(rbind, lapply(nor_sheets, function(path) {
    read_samplesheet(path)[, c("Sample_Name", "Sample_ID")]
  }))

  # merge ID translation into tables
  # (inner joins: samples without a translation entry are dropped)
  df.swe <- merge(df.swe.trans, df.swe, by = "Sample_ID")
  df.nor <- merge(df.nor.trans, df.nor, by = "Sample_ID")

  # write nor-only and swe-only data to files
  write_tab_table(df.nor, paste0(outroot, ".del_dup.carriers.NOR.tsv"))
  write_tab_table(df.swe, paste0(outroot, ".del_dup.carriers.SWE.tsv"))

  # write tables to excel spreadsheets
  write_carrier_xlsx(df.nor,
                     paste0(outroot, ".del_dup.carriers.NOR.xlsx"),
                     "NORDiC-NOR")
  write_carrier_xlsx(df.swe,
                     paste0(outroot, ".del_dup.carriers.SWE.xlsx"),
                     "NORDiC-SWE")

}


# Run the step only when the file is executed non-interactively (e.g. via
# Rscript); in an interactive session the definitions are loaded but Main()
# is not called.
if (!interactive()) {
  Main()
}
