
import argparse
import gzip
import sys

import pandas

def _read_iid_list(path):
    """Return the set of non-empty, whitespace-stripped lines found in *path*."""
    with open(path, "r") as in_fh:
        stripped = (line.strip() for line in in_fh)
        return {iid for iid in stripped if iid}


def _open_bed(path):
    """Open the BED callset for text reading: stdin, a gzip file or a plain file."""
    if path in ("stdin", "-"):
        return sys.stdin
    if path.endswith(".gz"):
        # text mode so lines are str, exactly as for plain files and stdin;
        # in binary mode the b"DEL"/b"DUP" fields would never match the str keys
        return gzip.open(path, "rt")
    return open(path, "r")


def _count_cnvs(bed_fh, cnvtype_idx, iid_idx):
    """Tally, per sample, the number of DEL/DUP records and their summed length.

    Returns a ``(n_cnv, nbp_cnv)`` pair; each is a dict keyed by "DEL"/"DUP"
    whose values map sample IID -> count (resp. total ``end - start``).
    """
    n_cnv = {"DEL": {}, "DUP": {}}
    nbp_cnv = {"DEL": {}, "DUP": {}}
    for line in bed_fh:
        data = line.split()
        # tolerate blank lines and '#' comment/header lines
        if not data or data[0].startswith("#"):
            continue
        cnvtype = data[cnvtype_idx]
        if cnvtype not in n_cnv:
            continue
        iid = data[iid_idx]
        nbp = int(data[2]) - int(data[1])
        n_cnv[cnvtype][iid] = n_cnv[cnvtype].get(iid, 0) + 1
        nbp_cnv[cnvtype][iid] = nbp_cnv[cnvtype].get(iid, 0) + nbp
    return n_cnv, nbp_cnv


def main(userargs):
    """Combine an intensity metrics tsv with per-sample CNV counts.

    Reads the metrics table, optionally restricts it to a list of IIDs,
    counts DEL and DUP records per sample in the BED callset, and writes the
    table back out with ``n_del``, ``nbp_del``, ``n_dup``, ``nbp_dup``,
    ``n_cnv`` and ``nbp_cnv`` columns appended (each suffixed with
    ``--cnv-cols-postfix``). Samples without calls get 0; BED records whose
    sample is not a row of the (filtered) metrics table are ignored.

    Args:
        userargs: list of command-line argument strings, without the
            program name (i.e. ``sys.argv[1:]``).

    Returns:
        None. The result is written to the output tsv path.

    NOTE: ``--iid-prefix-rm`` and ``--iid-postfix-rm`` are accepted but are
    not applied anywhere in this function.
    """

    # get user args
    parser = argparse.ArgumentParser(prog='add_cnv_data_to_metrics_tsv',
                                     description='combine intensity metrics tsv with n_cnv and nbp_cnv per sample.')
    parser.add_argument('--iid-prefix-rm', action='store', type=str, default=None,
                        help='prefix to remove from presumed sample filename, '
                             'to represent sample iid')
    parser.add_argument('--iid-postfix-rm', action='store', type=str, default=None,
                        help='postfix to remove from presumed sample filename, '
                             'to represent sample iid')
    parser.add_argument('--iid-col', action='store', type=str, default="IID",
                        help='column name to use for IID')
    parser.add_argument('--iids-keep-list', action='store', type=str, default=None,
                        help='list file containing IIDs to keep.')
    parser.add_argument('--bed-cnvtype-colnum', action='store', type=int, default=5,
                        help='column number of cnv type in bed file.')
    parser.add_argument('--bed-iid-colnum', action='store', type=int, default=6,
                        help='column number of iid in bed file.')
    parser.add_argument('--group-dataset-value', action='store', type=str,
                        default=None,
                        help='comma delim group,dataset values to insert '
                             'into table.')
    parser.add_argument('--cnv-cols-postfix', action='store', type=str, default="",
                        help='postfix string to append onto cnv col names')
    parser.add_argument('penncnv_intensity_metrics_tsv', type=str,
                        help='input PennCNV intensity metrics tsv file.')
    parser.add_argument('cnv_bed_file', type=str,
                        help='input BED callset file.')
    parser.add_argument('out_intensity_callset_metrics_tsv', type=str,
                        help='output tsv containing sample-level intensity metrics and cnv counts')
    args = parser.parse_args(userargs)

    # validate cheap-to-check options before touching any input file
    if args.bed_cnvtype_colnum < 1 or args.bed_iid_colnum < 1:
        # column numbers are 1-based; 0 would silently index the last column
        parser.error("bed column numbers must be >= 1")
    group_dataset = None
    if args.group_dataset_value is not None:
        group_dataset = args.group_dataset_value.split(",")
        if len(group_dataset) < 2:
            parser.error("--group-dataset-value must be of the form 'group,dataset'")

    # read intensity metrics tsv to df; IIDs are kept as strings so they
    # compare equal to the (string) IIDs from the keep list and the bed file
    df = pandas.read_csv(args.penncnv_intensity_metrics_tsv,
                         sep="\t", header=0, dtype={args.iid_col: str})

    # set iid as row name
    df.index = df[args.iid_col]

    # if file with list of iids to keep is provided, subset on it
    if args.iids_keep_list is not None:
        iids_keep = _read_iid_list(args.iids_keep_list)
        # copy so that the column assignments below act on an independent frame
        df = df[df[args.iid_col].isin(iids_keep)].copy()

    # count del / dup records per sample in the bed file
    bed_fh = _open_bed(args.cnv_bed_file)
    try:
        n_cnv, nbp_cnv = _count_cnvs(bed_fh,
                                     args.bed_cnvtype_colnum - 1,
                                     args.bed_iid_colnum - 1)
    finally:
        if bed_fh is not sys.stdin:
            bed_fh.close()

    # add n + nbp columns for del, dup. Values are looked up per existing row
    # (default 0), so samples absent from the table never create new rows.
    postfix = args.cnv_cols_postfix
    for cnvtype, label in (("DEL", "del"), ("DUP", "dup")):
        df["n_" + label + postfix] = [n_cnv[cnvtype].get(iid, 0)
                                      for iid in df.index]
        df["nbp_" + label + postfix] = [nbp_cnv[cnvtype].get(iid, 0)
                                        for iid in df.index]

    # add columns for total number of cnvs
    df["n_cnv" + postfix] = df["n_del" + postfix] + df["n_dup" + postfix]
    df["nbp_cnv" + postfix] = df["nbp_del" + postfix] + df["nbp_dup" + postfix]

    # if defined by user, insert group/dataset value as the leading columns
    if group_dataset is not None:
        cols_old = list(df.columns)
        df["group"] = group_dataset[0]
        df["dataset"] = group_dataset[1]
        df = df[["group", "dataset"] + cols_old]

    # write metric file
    df.to_csv(path_or_buf=args.out_intensity_callset_metrics_tsv,
              sep="\t", header=True, index=False)

    return

# script entry point: hand the command-line arguments (program name dropped) to main
if __name__ == "__main__":
    main(sys.argv[1:])
