 
import os
import sys
from collections import OrderedDict
import argparse

# library for easier reading of penncnv entries
from penncnv import PenncnvEntry

def _load_iid_locus_extract(extract_tsv):
    """Read the iid / locus pairs to keep from a tab-separated file.

    Lines starting with '#' and blank lines are skipped. For every other
    line the first two tab-separated columns are joined with a single space
    to form the lookup key.

    Args:
        extract_tsv: path to the tsv file; column 1 is the iid, column 2
            the cnv locus.

    Returns:
        A set of "<iid> <locus>" strings.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than two
            tab-separated columns.
    """
    iid_locus_extract = set()
    with open(extract_tsv, "r") as extract_fh:
        for line in extract_fh:
            if line.startswith("#"):
                continue
            stripped = line.rstrip()
            # a blank line (e.g. a trailing newline at the end of the file)
            # carries no iid / locus pair, so skip it rather than fail
            if not stripped:
                continue
            data = stripped.split("\t")
            iid_locus_extract.add(data[0] + " " + data[1])
    return iid_locus_extract


def main(userargs):
    """Write the calls of a PennCNV callset file as a tab-separated table.

    Every input line is parsed with PenncnvEntry. A call is written when its
    chromosome is in --chr-include and, if a non-empty iid / locus extract
    set was loaded, when its "<iid> <interval>" key is in that set. Output
    columns are: dataset, IID, chrom, start0 (entry.start - 1), end, locus,
    cnvtype, length, numsnp. The header row is only written when the write
    mode is exactly "w".

    Note: --iid-prefix-rm and --iid-postfix-rm are parsed but are not used
    anywhere in this function.

    Args:
        userargs: list of command line argument strings (without the
            program name), handed to argparse.

    Returns:
        None.
    """

    # get user args
    parser = argparse.ArgumentParser(
        prog='collect_penncnv_call_metrics',
        description='using callset file in output format produced by PennCNV, '
                    'produce an output that meets user specified requirements '
                    'regarding size and number of spanning snps. User has option '
                    'to write output in BED format.')
    parser.add_argument("--chr-include", action='store', type=str,
                        default="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT",
                        help="chromosomes to subset on, default %(default)s.")
    parser.add_argument('--write-mode', action='store', type=str, default='w',
                        help='write mode to use for writing output file')
    parser.add_argument('--iid-prefix-rm', action='store', type=str, default=None,
                        help='prefix to remove from presumed sample filename, '
                             'to represent sample iid')
    parser.add_argument('--iid-postfix-rm', action='store', type=str, default=None,
                        help='postfix to remove from presumed sample filename, '
                             'to represent sample iid')
    parser.add_argument('--iid-locus-extract-tsv', action='store',
                        type=str, default=None,
                        help='tsv with iid and cnv locus to subset on')
    parser.add_argument("in_penncnv_txt", type=str,
                        help="input penncnv callset text file")
    parser.add_argument("out_tsv", type=str, help="output table tsv")
    args = parser.parse_args(userargs)

    # init filehandle to penncnv file; the input is opened before the output
    # so that a missing input does not create / truncate the output file
    reading_stdin = args.in_penncnv_txt in ("stdin", "-")
    if reading_stdin:
        in_fh = sys.stdin
    else:
        in_fh = open(args.in_penncnv_txt, "r")

    try:
        # define set with chromosomes to subset on
        chr_include = set(args.chr_include.split(","))

        # get iid / loci to extract, if defined
        iid_locus_extract = set()
        if args.iid_locus_extract_tsv is not None:
            iid_locus_extract = _load_iid_locus_extract(
                args.iid_locus_extract_tsv)

        # init output; the context manager closes it even if parsing fails
        with open(args.out_tsv, args.write_mode) as out_fh:
            if args.write_mode == "w":
                header_list = ["dataset", "IID", "chrom", "start0", "end",
                               "locus", "cnvtype", "length", "numsnp"]
                out_fh.write("\t".join(header_list) + "\n")

            # for each line in penncnv file ..
            for penncnv_line in in_fh:

                # init as object
                entry = PenncnvEntry(penncnv_line)

                # entry.iid is expected to look like dataset.iid; the first
                # two dot-separated parts are used
                dataset_iid = entry.iid.split(".")
                dataset = dataset_iid[0]
                iid = dataset_iid[1]

                # if entry doesn't map to chromosomes of interest, then skip
                if entry.chrom not in chr_include:
                    continue

                # an empty extract set means no iid / locus filtering
                if iid_locus_extract:
                    # iid / locus not in extract set, then skip
                    if iid + " " + entry.interval not in iid_locus_extract:
                        continue

                # form output list; start is shifted by one for the start0
                # column
                out_list = [dataset, iid,
                            entry.chrom, str(entry.start - 1), str(entry.end),
                            entry.interval, entry.cnv_type,
                            str(entry.length), str(entry.numsnp)]

                # write to file
                out_fh.write("\t".join(out_list) + "\n")
    finally:
        # only close what this function opened; sys.stdin is left alone
        if not reading_stdin:
            in_fh.close()

    return


if __name__ == "__main__":
    # script entry point: hand everything after the program name to main()
    main(sys.argv[1:])
