library(optparse)

# Main: tabulate per-dataset CNV burden after each filtering step.
#
# Usage (positional args follow the options):
#   Rscript <script> [options] STEP:BEDFILE [STEP:BEDFILE ...]
#
# Each positional argument pairs a filter-step label with a headerless,
# 6-column BED-like file (chrom, start0, end, locus, cnvtype, iid).
# For every (filter step, dataset) pair the function computes either the
# summed CNV length per sample (default) or the number of CNV calls per
# sample (--report-n-cnv), where "per sample" means divided by the number of
# samples of that dataset present in both the group/dataset/iid table and the
# fam file. The result is written as a tab-delimited table with one row per
# filter step and one column per dataset.
#
# Stops with an error if no positional args are given, if a required option
# (--fam-file, --group-dataset-iid-tsv, --out-tsv) is missing, or if a
# positional arg is not of the form STEP:BEDFILE.
Main <- function() {
  option_list <- list(
    make_option(c("--samplelevel-mode"), action = "store_true", default = FALSE,
      help = "make plot with each row representing a sample with a CNV on chrom"),
    make_option(c("--add-ctrlonly-aov"), action = "store_true", default = FALSE,
      help = "include columns with control-only anova relative to dataset"),
    make_option("--ref-genome", default = "hg19", action = "store",
      type = "character",
      help = "name of reference genome build (hg19/hg18/hg38) [default %default]"),
    make_option("--group", default = NA, action = "store", type = "character",
      help = "name of group to focus counting on [default %default]"),
    make_option("--dataset-adj", default = NA, action = "store",
      type = "character",
      help = "comma-delim transformed dataset names [default %default]"),
    make_option("--dels-only", default = FALSE, action = "store_true",
      help = "report stats for deletions only [default %default]"),
    make_option("--dups-only", default = FALSE, action = "store_true",
      help = "report stats for duplications only [default %default]"),
    # NOTE(review): with this flag the code below reports CNV count divided by
    # sample count (not a raw per-dataset count) -- confirm intended wording.
    make_option("--report-n-cnv", default = FALSE, action = "store_true",
      help = "report cnv counts per dataset, rather than rate per sample [default %default]"),
    make_option("--group-dataset-iid-tsv", default = NA, action = "store",
      type = "character",
      help = "name of input group_dataset_iid tsv file [REQUIRED, default %default]"),
    make_option("--fam-file", default = NA, action = "store",
      type = "character",
      help = "name of PLINK fam file [REQUIRED, default %default]"),
    # the output is written with write.table(), i.e. it is a tsv, not a pdf
    make_option("--out-tsv", default = NA, action = "store",
      type = "character",
      help = "name of output tsv file [REQUIRED, default %default]")
  )

  # parse the command line; --help prints usage and exits, options that are
  # not supplied keep the defaults declared above
  opt_parser <- OptionParser(option_list = option_list)
  opt <- parse_args(opt_parser, positional_arguments = TRUE,
                    convert_hyphens_to_underscores = TRUE)
  filterstep_bedfile_sets <- opt$args
  opts <- opt$options

  # at least one STEP:BEDFILE positional arg is required
  if (length(filterstep_bedfile_sets) == 0) {
    print_help(opt_parser)
    stop("missing REQUIRED arg(s)")
  }

  # all required options must be defined (scalar tests, so use ||)
  if (is.na(opts$fam_file) ||
      is.na(opts$group_dataset_iid_tsv) ||
      is.na(opts$out_tsv)) {
    print_help(opt_parser)
    stop("missing at least one REQUIRED opt")
  }

  # split each positional arg into its filter-step label and bed file path,
  # failing early (rather than inside read.table) on malformed specs
  specs <- strsplit(filterstep_bedfile_sets, ":")
  malformed <- vapply(
    specs,
    function(s) length(s) < 2 || !nzchar(s[1]) || !nzchar(s[2]),
    logical(1)
  )
  if (any(malformed)) {
    stop("positional arg(s) not of the form STEP:BEDFILE: ",
         paste(filterstep_bedfile_sets[malformed], collapse = ", "))
  }
  filtersteps <- vapply(specs, function(s) s[1], character(1))
  bedfiles <- vapply(specs, function(s) s[2], character(1))

  # read group/dataset/iid tsv, optionally restricting to one group
  # ("full" means no restriction)
  group_dataset_iid <- read.table(opts$group_dataset_iid_tsv,
                                  header = TRUE, stringsAsFactors = FALSE)
  group_dataset_iid <- unique(group_dataset_iid)
  colnames(group_dataset_iid) <- c("group", "dataset", "IID")
  if (!is.na(opts$group) && opts$group != "full") {
    group_dataset_iid <- subset(group_dataset_iid, group == opts$group)
  }

  # read in PLINK fam file, merge sex and pheno into data; the merge also
  # restricts the sample set to IIDs present in the fam file
  fam <- read.table(opts$fam_file, header = FALSE, stringsAsFactors = FALSE)
  colnames(fam) <- c("FID", "IID", "PID", "MID", "SEX", "PHE")
  fam <- fam[, c("IID", "SEX", "PHE")]
  fam$FEMALE <- fam$SEX - 1
  fam$CASE <- fam$PHE - 1
  df <- merge(group_dataset_iid, fam[, c("IID", "FEMALE", "CASE")], by = "IID")
  rownames(df) <- df$IID

  # if defined, collapse dataset names: any dataset whose name matches one of
  # the user-supplied patterns is relabelled with that pattern
  df$dataset_adj <- df$dataset
  if (!is.na(opts$dataset_adj)) {
    dataset_adj <- strsplit(opts$dataset_adj, ",")[[1]]
    for (x in dataset_adj) {
      df$dataset_adj <- ifelse(grepl(x, df$dataset_adj), x, df$dataset_adj)
    }
  }
  # keep dataset labels as character so they are always used as column
  # *names* below, never as positional indices
  df$dataset <- as.character(df$dataset_adj)

  # one output row per filter step, one zero-initialised column per dataset
  out_df <- data.frame(filter_step = filtersteps, stringsAsFactors = FALSE)
  rownames(out_df) <- filtersteps
  datasets <- unique(sort(df$dataset))
  for (col in datasets) {
    out_df[[col]] <- rep(0, nrow(out_df))
  }

  # sample IDs per dataset do not depend on the filter step, so look them up
  # once instead of once per (step, dataset) pair
  dataset_iids <- lapply(datasets, function(ds) {
    df$IID[which(df$dataset == ds)]
  })

  # for each filter step ..
  for (i in seq_along(filtersteps)) {
    filterstep <- filtersteps[i]
    bedfile <- bedfiles[i]

    # read bed file; a zero-byte file means the step left no CNVs, which
    # read.table() would otherwise reject with an error
    if (isTRUE(file.info(bedfile)$size == 0)) {
      cnv <- data.frame(chrom = character(0), start0 = integer(0),
                        end = integer(0), locus = character(0),
                        cnvtype = character(0), iid = character(0),
                        stringsAsFactors = FALSE)
    } else {
      cnv <- read.table(bedfile, header = FALSE, stringsAsFactors = FALSE)
      colnames(cnv) <- c("chrom", "start0", "end", "locus", "cnvtype", "iid")
    }

    # lengths are held as doubles: summing an integer column over a whole
    # dataset can exceed the 32-bit integer range and may come back as NA
    cnv$length <- as.numeric(cnv$end) - as.numeric(cnv$start0)

    # keep samples in group/dataset/iid df (shouldn't be any samples
    # not in original group/dataset/iid)
    cnv <- subset(cnv, iid %in% df$IID)

    # if desired by user, keep only dels or only dups (dels win if both set)
    if (opts$dels_only) {
      cnv <- subset(cnv, cnvtype == "DEL")
    } else if (opts$dups_only) {
      cnv <- subset(cnv, cnvtype == "DUP")
    }

    # for each dataset ..
    for (j in seq_along(datasets)) {
      ds <- datasets[j]

      # CNV calls carried by samples of this dataset
      n <- length(dataset_iids[[j]])
      cnv_x <- cnv[cnv$iid %in% dataset_iids[[j]], , drop = FALSE]

      # totals: number of calls and number of base pairs covered
      n_cnv <- nrow(cnv_x)
      nbp_cnv <- sum(cnv_x$length)

      # rate per sample: call count if requested, otherwise base pairs
      value <- if (opts$report_n_cnv) n_cnv / n else nbp_cnv / n
      out_df[filterstep, ds] <- value
    }
  }

  # write stats to file
  write.table(out_df,
              file = opts$out_tsv,
              col.names = TRUE, row.names = FALSE,
              sep = "\t", quote = FALSE)

}

# Run Main() only when executed as a script (e.g. via Rscript); in an
# interactive session the file can be sourced without side effects.
if (!interactive()) {
  Main()
}
