require(optparse, quietly=T)

Main <- function() {

  # ADD OPTION FOR median(df$nbp_cnv) +/- 3*mad(df$nbp_cnv)

  # List of command-line options
  option_list <- list(
    make_option(c("--iids-subset-txt"), type="character", default=NA,
                help="text file with IIDs to subset on before outlier pruning, [default %default]"),
    make_option(c("--param-maxes"), type="character", default=NA,
                  help="param/max combinations, format param1=max1,param2=max2.., [default %default]"),
    make_option(c("--n-cnv-max"), type="numeric", default=NA,
                  help="maximum allowed n_cnv allowed per sample, [default %default]"),
    make_option(c("--nbp-cnv-max"), type="numeric", default=NA,
                  help="maximum allowed nbp_cnv allowed per sample, [default %default]"),
    make_option(c("--n-sd-thresh"), type="numeric", default=3, 
                help="Number of standard deviations from the mean to define as threshold, [default %default]"),
    make_option(c("--use-median-mad"), type="logical", default=FALSE, action="store_true",
                help="Instead of mean +/- SD, define outliers based on \
                      median +/- MAD, where MAD is median absolute deviation, [default %default]"),
    make_option(c("--outlier-metric-count-tsv"), type="character", default=NA,
                help="file to write metric count sumstats to, [default %default]"),
    make_option(c("--group-dataset-col-values"), type="character", default=NA,
                help="comma-delim set of group,dataset identifiers to add to front of table, [default %default]"),
    make_option(c("--by-dataset"), action="store_true", type="logical",
                default=FALSE,
                help="compute mean/sd and prune outliers based on dataset [default %default]"),
    make_option(c("--outlier-param-cols"), type="character", default="", 
                help="Names of columns to apply thresholding to, [default %default]")
  )

  # Get command-line arguments & options
  args <- parse_args(OptionParser(usage="%prog freq_matrix.bed.gz outfile.bed",
                                  option_list=option_list),
                     positional_arguments=TRUE)
  opts <- args$options

  # get positional args
  if (length(args$args) != 2) {
    cat("intensity_callset_metrics_outlierpruning.R [OPTS] <intensity_callset_metrics_tsv> <out_tsv>\n")
    q()
  }
  intensity_callset_metrics_tsv <- args$args[1]
  iids_keep_txt <- args$args[2]

  # read intensity / callset metrics tsv
  df <- read.table(intensity_callset_metrics_tsv, header=T, 
                   sep="\t", stringsAsFactors=F)


  # add column for absolute value of waviness factor
  df$abs_WF <- abs(df$WF)

  # if defined by user, read iids to subset on and subest callset metrics on them
  if (is.na(opts[["iids-subset-txt"]])==F) {
    iids_subset <- scan(opts[["iids-subset-txt"]],
                        what=character(),
                        quiet=T)
    df <- subset(df, IID %in% iids_subset)
  }

  # get columns to apply thresholding to
  cols <- strsplit(opts[["outlier-param-cols"]],",")[[1]]
  
  # form variables for tracking n samples removed at each thresholding,
  # and which samples are removed
  iids.rm <- c()
  n.iids.rm <- NULL
 
  if (opts[["by-dataset"]] == T) {
    datasets <- c(unique(sort(df$dataset)),NA)
  } else {
    datasets <- c(NA)
  }
  for (metric.col in cols) {
    print(metric.col)
    for (dataset.x in datasets) {
      print(dataset.x)
      if (is.na(dataset.x) == F) {
        df.d <- subset(df, dataset == dataset.x)
      } else {
        df.d <- df
        dataset.x <- "ALL"
      }
      vals <- df.d[[metric.col]]
      vals.mean <- mean(vals)
      vals.sd <- sd(vals)
      vals.median <- median(vals)
      vals.mad <- mad(vals)
      if (opts[["use-median-mad"]] == TRUE) {
        meansd_or_medianmad <- "median/MAD"
        vals.lowerbound <- vals.median - opts[["n-sd-thresh"]] * vals.mad
        vals.upperbound <- vals.median + opts[["n-sd-thresh"]] * vals.mad
      } else {
        meansd_or_medianmad <- "mean/SD"
        vals.lowerbound <- vals.mean - opts[["n-sd-thresh"]] * vals.sd
        vals.upperbound <- vals.mean + opts[["n-sd-thresh"]] * vals.sd
      }
      df.rm <- subset(df.d, 
                      ( df.d[[metric.col]] < vals.lowerbound )
                      |
                      ( df.d[[metric.col]] > vals.upperbound )
                     )
      iids.rm.x <- df.rm$IID
      n.iids.rm.i <- data.frame(param=metric.col,
                                dataset=dataset.x,
                                val_mean=vals.mean,
                                val_median=vals.median,
                                val_sd=vals.sd,
                                val_mad=vals.mad,
                                outlier_def=meansd_or_medianmad,
                                val_lower=vals.lowerbound,
                                val_upper=vals.upperbound,
                                n_rm=length(iids.rm.x)
                               )
      if (is.null(n.iids.rm)) {
        n.iids.rm <- n.iids.rm.i
        iids.rm <- iids.rm.x
      } else {
        n.iids.rm <- rbind(n.iids.rm, n.iids.rm.i)
        iids.rm <- c(iids.rm, iids.rm.x)
      }
    }
  }
  iids.rm <- unique(sort(iids.rm))
  
  X=0; if (X==1) {

  # for each col, apply outlier detection and removal
  for (col in cols) {
    vals <- df[[col]]
    vals.mean <- mean(vals)
    vals.sd <- sd(vals)
    vals.median <- median(vals)
    vals.mad <- mad(vals)
    if (opts[["use-median-mad"]] == TRUE) {
      meansd_or_medianmad <- "median/MAD"
      vals.lowerbound <- vals.median - opts[["n-sd-thresh"]] * vals.mad
      vals.upperbound <- vals.median + opts[["n-sd-thresh"]] * vals.mad
    } else {
      meansd_or_medianmad <- "mean/SD"
      vals.lowerbound <- vals.mean - opts[["n-sd-thresh"]] * vals.sd
      vals.upperbound <- vals.mean + opts[["n-sd-thresh"]] * vals.sd
    }
    df.rm <- subset(df, 
                    ( df[[col]] < vals.lowerbound )
                    |
                    ( df[[col]] > vals.upperbound )
                   )
    iids.rm.x <- df.rm$IID
    iids.rm <- c(iids.rm, iids.rm.x)
    n.iids.rm.i <- data.frame(param=col,
                              dataset="ALL",
                              val_mean=vals.mean,
                              val_median=vals.median,
                              val_sd=vals.sd,
                              val_mad=vals.mad,
                              outlier_def=meansd_or_medianmad,
                              val_lower=vals.lowerbound,
                              val_upper=vals.upperbound,
                              n_rm=length(iids.rm.x)
                             )
    if (is.null(n.iids.rm)) {
      n.iids.rm <- n.iids.rm.i
    } else {
      n.iids.rm <- rbind(n.iids.rm, n.iids.rm.i)
    }
  }

  }

  if (is.na(opts[["param-maxes"]])==FALSE) {
    maxes_str <- strsplit(opts[["param-maxes"]], ",")[[1]]
    for (max_keyval_str in maxes_str) {
      max_keyval <- strsplit(max_keyval_str, "=")[[1]]
      max_key <- max_keyval[1]
      max_val <- as.numeric(max_keyval[2])
    
      df.rm <- subset(df, df[[max_key]] > max_val)
      iids.rm.x <- df.rm$IID
      iids.rm <- c(iids.rm, iids.rm.x)
      n.iids.rm.i <- data.frame(param=max_key,
                                dataset="ALL",
                                val_mean=mean(df[[max_key]]),
                                val_median=median(df[[max_key]]),
                                val_sd=sd(df[[max_key]]),
                                val_mad=mad(df[[max_key]]),
                                outlier_def="user_defined_max",
                                val_lower=NA,
                                val_upper=max_val,
                                n_rm=length(iids.rm.x)
                               )
    
      if (is.null(n.iids.rm)) {
        n.iids.rm <- n.iids.rm.i
      } else {
        n.iids.rm <- rbind(n.iids.rm, n.iids.rm.i)
      }
    }
  } 
  if (is.na(opts[["nbp-cnv-max"]])==FALSE) {
    df.rm <- subset(df, nbp_cnv > opts[["nbp-cnv-max"]])
    iids.rm.x <- df.rm$IID
    iids.rm <- c(iids.rm, iids.rm.x)
    n.iids.rm.i <- data.frame(param="nbp_cnv",
                              dataset="ALL",
                              val_mean=mean(df$nbp_cnv),
                              val_median=median(df$nbp_cnv),
                                  val_sd=sd(df$nbp_cnv),
                                  val_mad=mad(df$nbp_cnv),
                                  outlier_def="user_defined_max",
                                  val_lower=NA,
                                  val_upper=opts[["nbp-cnv-max"]],
                                  n_rm=length(iids.rm.x)
                                 )
    if (is.null(n.iids.rm)) {
      n.iids.rm <- n.iids.rm.i
    } else {
      n.iids.rm <- rbind(n.iids.rm, n.iids.rm.i)
    }
  }

  # define set of all unique iids to remove, store count
  iids.rm <- unique(sort(iids.rm))
  n.iids.rm <- rbind(n.iids.rm, 
                     data.frame(param="TOTAL", 
                                dataset=NA,
                                val_mean=NA,
                                val_median=NA,
                                val_sd=NA,
                                val_mad=NA,
                                outlier_def=NA,
                                val_lower=NA,
                                val_upper=NA,
                                n_rm=length(iids.rm)
                               )
                    )

  # define set of iids to keep
  iids.keep <- setdiff(df$IID, iids.rm)

  # if user-defined, write param outlier metrics to file
  if (is.na(opts[["outlier-metric-count-tsv"]])==F) {
    write.table(n.iids.rm,
                file=opts[["outlier-metric-count-tsv"]],
                row.names=F, col.names=T, sep="\t", quote=F)
  }

  # write list of IIDs to keep
  write.table(iids.keep,
              file=iids_keep_txt,
              row.names=F, col.names=F,
              quote=F)
}

OutlierPruning <- function(df, metric.col, dataset=NA,
                           use.median.mad=FALSE,
                           n.sd.thresh=3,
                           res=NULL) {
  if (is.na(dataset) == F) {
    df <- subset(df, dataset == dataset)
  } else {
    dataset <- "ALL"
  }
  vals <- df[[metric.col]]
  vals.mean <- mean(vals)
  vals.sd <- sd(vals)
  vals.median <- median(vals)
  vals.mad <- mad(vals)
  if (opts[["use-median-mad"]] == TRUE) {
    meansd_or_medianmad <- "median/MAD"
    vals.lowerbound <- vals.median - opts[["n-sd-thresh"]] * vals.mad
    vals.upperbound <- vals.median + opts[["n-sd-thresh"]] * vals.mad
  } else {
    meansd_or_medianmad <- "mean/SD"
    vals.lowerbound <- vals.mean - opts[["n-sd-thresh"]] * vals.sd
    vals.upperbound <- vals.mean + opts[["n-sd-thresh"]] * vals.sd
  }
  df.rm <- subset(df, 
                  ( df[[col]] < vals.lowerbound )
                  |
                  ( df[[col]] > vals.upperbound )
                 )
  iids.rm.x <- df.rm$IID
  iids.rm <- c(iids.rm, iids.rm.x)
  n.iids.rm.i <- data.frame(param=col,
                            dataset=dataset,
                            val_mean=vals.mean,
                            val_median=vals.median,
                            val_sd=vals.sd,
                            val_mad=vals.mad,
                            outlier_def=meansd_or_medianmad,
                            val_lower=vals.lowerbound,
                            val_upper=vals.upperbound,
                            n_rm=length(iids.rm.x)
                           )
  if (is.null(res)) {
    res <- list()
    res$tbl <- n.iids.rm.i
    res$iids_rm <- iids.rm.x
  } else {
    res$tbl <- rbind(res$tbl, n.iids.rm.i)
    res$iids_rm <- c(res$iids_rm, iids.rm.x)
  }
  return(res)
}

if (interactive() == F){ 
  Main()
}

