

import os
import sys
from collections import OrderedDict
import argparse

def _build_arg_parser():
    """Build the command-line parser for the log-to-TSV conversion."""
    parser = argparse.ArgumentParser(
        prog='rawcnv_log_to_qcmetrics_tsv',
        description='from logfile made by penncnv detect_cnv.pl, '
                    'produce a tsv file containing QC metrics '
                    'per input sample.')
    parser.add_argument('--qc-cols', action='store', type=str,
                        help='comma-delim set of QC metrics from logfile to write to tsv',
                        default='LRR_mean,LRR_median,LRR_SD,BAF_mean,'
                                'BAF_median,BAF_SD,BAF_DRIFT,WF,GCWF')
    parser.add_argument('--iid-prefix-rm', action='store', type=str, default=None,
                        help='prefix to remove from presumed sample filename, '
                             'to represent sample iid')
    parser.add_argument('--iid-postfix-rm', action='store', type=str, default=None,
                        help='postfix to remove from presumed sample filename, '
                             'to represent sample iid')
    parser.add_argument('penncnv_rawcnv_log_file', type=str,
                        help='input PennCNV .rawcnv.log file.')
    parser.add_argument('out_tsv', type=str,
                        help='output tsv containing sample-level values for QC metrics')
    return parser


def _strip_affixes(name, prefix=None, postfix=None):
    """Return name without a leading prefix and/or a trailing postfix."""
    # Only the very start / very end is trimmed; occurrences of the same
    # text elsewhere in the name are part of the sample id and must stay.
    if prefix and name.startswith(prefix):
        name = name[len(prefix):]
    if postfix and name.endswith(postfix):
        name = name[:-len(postfix)]
    return name


def _parse_quality_summaries(lines, prefix=None, postfix=None):
    """Collect {sample id: {metric name: value string}} from log lines."""
    notice_tag = "NOTICE: "
    summary_tag = "quality summary for "
    metrics = {}

    for line in lines:
        line = line.rstrip()

        # only "NOTICE: " lines can carry a quality summary
        if not line.startswith(notice_tag):
            continue
        line = line[len(notice_tag):]

        # "<sample info>: <key=value key=value ...>"
        sampleinfo, sep, metrics_str = line.partition(": ")
        if not sep or not sampleinfo.startswith(summary_tag):
            continue

        sample_id = _strip_affixes(sampleinfo[len(summary_tag):], prefix, postfix)

        sample_metrics = {}
        for token in metrics_str.split():
            key, eq, value = token.partition("=")
            # tokens that are not key=value pairs carry no metric; skip them
            if eq:
                sample_metrics[key] = value

        # a later summary for the same sample id replaces an earlier one
        metrics[sample_id] = sample_metrics

    return metrics


def main(userargs):
    """Convert a PennCNV detect_cnv.pl log into a per-sample QC metrics tsv.

    Reads every "NOTICE: quality summary for <sample>: k=v ..." line of the
    input log, derives the sample id from <sample> (optionally trimming a
    leading prefix and trailing postfix), and writes one tab-delimited row
    per sample, sorted by sample id, with the columns named in --qc-cols.

    Args:
        userargs: list of command-line argument strings (without the
            program name), as accepted by the parser built in
            _build_arg_parser().

    Returns:
        None.

    Raises:
        SystemExit: with code 1 if a requested QC metric is absent for any
            sample (a message is written to stderr and no output file is
            produced), or raised by argparse on invalid arguments.
    """
    args = _build_arg_parser().parse_args(userargs)

    # read the log; the context manager guarantees the handle is closed
    with open(args.penncnv_rawcnv_log_file, "r") as log_fh:
        metrics = _parse_quality_summaries(log_fh,
                                           args.iid_prefix_rm,
                                           args.iid_postfix_rm)

    # build and validate every row before touching the output file, so a
    # missing metric never leaves a truncated tsv behind
    cols = args.qc_cols.split(',')
    rows = []
    for sample_i in sorted(metrics):
        sample_metrics = metrics[sample_i]
        for col in cols:
            if col not in sample_metrics:
                sys.stderr.write('ERROR : qc metric ' + col +
                                 ' absent from data (on sample ' +
                                 sample_i + ')\n')
                sys.exit(1)
        rows.append([sample_i] + [sample_metrics[col] for col in cols])

    # write results per sample in output (table format)
    with open(args.out_tsv, "w") as out_fh:
        out_fh.write("\t".join(['IID'] + cols) + "\n")
        for row in rows:
            out_fh.write("\t".join(row) + "\n")

    return

if __name__ == "__main__":
    # Script entry point: hand the command-line arguments (everything after
    # the program name) straight to main().
    main(sys.argv[1:])
