#!/bin/bash
#SBATCH -A sens2018605
#SBATCH -p core
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -t 05:00:00
#SBATCH --mem=8g
#SBATCH -J 01.qual_cnv
#SBATCH --output=logs/qual_cnv/qual_cnv.%A.out
#SBATCH --error=logs/qual_cnv/qual_cnv.%A.err

## MODULE
# Load the python/3.8.7 environment module; the `python` steps further down
# in this script run after this module has been loaded.
module load python/3.8.7

## PARAM
# Datasets to process. Every entry has the form "<NOR|SWE>:<dataset name>";
# the part before the colon and the part after it are both used as
# directory components when locating inputs and writing outputs.
DATASETS=()

# Norwegian datasets
DATASETS+=(
  "NOR:NORDiC-NOR_cases_2019_1"
  "NOR:NORDiC-NOR_cases_2019_2"
  "NOR:NORDiC-NOR_cases_2020"
  "NOR:norway_ctrls_2019"
  "NOR:norway_ctrls_2020"
)

# Swedish datasets
DATASETS+=(
  "SWE:LG500_ctrls_2019"
  "SWE:NORDiC-SWE_cases_2018"
  "SWE:NORDiC-SWE_cases_2020"
  "SWE:ANGI_PhaseII_Pedersen_Controls_GSA-MD_wave1"
  "SWE:ANGI_PhaseII_Pedersen_Controls_GSA-MD_wave2"
)

# get path to penncnv scripts
# PENNCNV_DIR (used to locate clean_cnv.pl) and PY3_MODULES_DIR are not
# assigned anywhere in this script, so they are expected to come from this
# config file (or from the calling environment). The path is relative to the
# current working directory. Fail fast here: without PENNCNV_DIR every
# clean_cnv.pl invocation further down would fail.
if [[ ! -r cfg/cnv_calling.cfg ]]
then
  echo "ERROR: cannot read cfg/cnv_calling.cfg (cwd: ${PWD})" >&2
  exit 1
fi
source cfg/cnv_calling.cfg
: "${PENNCNV_DIR:?PENNCNV_DIR must be set (normally by cfg/cnv_calling.cfg)}"

# add to python path
# Appends the project's python sources and PY3_MODULES_DIR (not assigned in
# this script; presumably provided by cfg/cnv_calling.cfg) to PYTHONPATH.
# ${PYTHONPATH:+...} keeps any existing entries but does not emit a leading
# ":" when PYTHONPATH is empty or unset; an empty entry in the list would be
# picked up by Python as an extra current-directory search path.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}/src/python/:${PY3_MODULES_DIR}"

# Output root for this stage (-p: no error if it is already there).
mkdir -p results/qual_cnv/

# Old -> new sample ID table: a straight copy of the table stored with the
# SWE 2018 cases under the NORDiC_GWAS_CNV_202103 results tree.
iid_table_src=../NORDiC_GWAS_CNV_202103/results/bcf/SWE/NORDiC-SWE_cases_2018/NORDiC-SWE_cases_2018.sampleid_old.sampleid_new.tsv
iid_table_dst=results/qual_cnv/NORDiC_2021.sampleid_old.sampleid_new.tsv
cat "$iid_table_src" > "$iid_table_dst"

# Per-dataset processing. For every "<NOR|SWE>:<dataset>" entry in DATASETS:
#   1. merge the per-batch quantisnp .cnv tables into one tsv
#   2. intersect the quantisnp callset with the penncnv callset
#   3. run clean_cnv.pl combineseg repeatedly until the number of lines in
#      the callset stops changing
#   4. build the per-sample intensity metrics table from the penncnv log
#   5. write raw and filtered ("qual_cnv") callsets in penncnv and BED format
#   6. add raw and filtered cnv columns to the metrics table
# Requires PENNCNV_DIR to point at the directory holding clean_cnv.pl.

# inputs produced elsewhere (the NORDiC_GWAS_CNV_202103 results tree)
upstream_results=../NORDiC_GWAS_CNV_202103/results
snp_pos_tsv=${upstream_results}/cnv_calling_files/NORDiC_2021.snpid_chrom_pos.X_Y_XY_M.tsv

# for each dataset..
for dsfull in "${DATASETS[@]}"
do

  # get nor_or_swe, dataset name (text before / after the first ':')
  nor_or_swe=${dsfull%%:*}
  ds=${dsfull#*:}
  echo "$dsfull"

  # output directory and common file prefix for this dataset
  ds_dir=results/qual_cnv/${nor_or_swe}/${ds}
  ds_out=${ds_dir}/${ds}

  # make sure output dir exists
  mkdir -p "${ds_dir}/"

  # make a merged dataset for quantisnp callset
  # (the glob is left unquoted on purpose so it expands to every batch file)
  src/qual_cnv/tables_merge.sh \
    "${ds_out}.quantisnp.tsv" \
    "${upstream_results}/quantisnp_batch_primary_callset/${nor_or_swe}/${ds}"/*/*.cnv

  # intersect quantisnp callset with penncnv callset
  python src/qual_cnv/quantisnp_calls_merge.py \
    --iid-prefix-rm "../../../genomic_wave/${nor_or_swe}/${ds}/" \
    --iid-postfix-rm '.txt.adjusted' \
    "$snp_pos_tsv" \
    "${ds_out}.quantisnp.tsv" \
    "${upstream_results}/penncnv_primary_callset/${nor_or_swe}/${ds}/${ds}.rawcnv" \
    > "${ds_out}.penncnv_quantisnp_intersect.rawcnv"

  # get number of cnvs in pre-combineseg callset
  n_cnv_0=$(wc -l < "${ds_out}.penncnv_quantisnp_intersect.rawcnv")

  # iterate combineseg until the callset size is stable.
  #   j            - current iteration number (1-based)
  #   n_cnv_steps  - callset size before iteration 1, then after each
  #                  iteration that changed it
  #   logfile      - per-dataset iteration log, truncated at the start
  j=1
  convergence=0
  n_cnv_steps=("$n_cnv_0")
  logfile=${ds_out}.combineseg_iter.cnv.log
  : > "$logfile"
  while (( convergence != 1 ))
  do

    # form intermediate file: first iteration starts from the intersect
    # callset, later iterations from the previous iteration's output
    if (( j == 1 ))
    then
      cp \
        "${ds_out}.penncnv_quantisnp_intersect.rawcnv" \
        "${ds_out}.pre_combineseg.rawcnv"
    else
      cp \
        "${ds_out}.post_combineseg.rawcnv" \
        "${ds_out}.pre_combineseg.rawcnv"
    fi

    # print status to stdout
    echo "combineseg_iter (${nor_or_swe} / ${ds}) - iter ${j} initiated"

    # run single round of combineseg algorithm on current iteration of callset.
    # A failed run must stop the job: it would leave an empty output file,
    # and the line-count comparison below would then report "convergence"
    # on an empty callset one iteration later.
    if ! perl "${PENNCNV_DIR}/clean_cnv.pl" combineseg \
      --signalfile "$snp_pos_tsv" \
      --fraction "0.2" \
      "${ds_out}.pre_combineseg.rawcnv" \
      > "${ds_out}.post_combineseg.rawcnv" \
      2> "${ds_out}.post_combineseg.rawcnv.log"
    then
      echo "combineseg_iter (${nor_or_swe} / ${ds}) - iter ${j} failed," \
           "see ${ds_out}.post_combineseg.rawcnv.log" >&2
      exit 1
    fi

    # check number of cnvs pre and post combineseg iteration
    n_cnv_pre=$(wc -l < "${ds_out}.pre_combineseg.rawcnv")
    n_cnv_post=$(wc -l < "${ds_out}.post_combineseg.rawcnv")

    # if number of cnvs pre-combineseg and post-combineseg are equal,
    # convergence has been reached. otherwise, go onto next iteration.
    if (( n_cnv_pre == n_cnv_post ))
    then
      convergence=1
      n_iter_total=$(( ${#n_cnv_steps[@]} - 1 ))
      echo "n_iter_total=${n_iter_total}" >> "$logfile"

      # print info to stdout
      echo "combineseg_iter (${nor_or_swe} / ${ds}) - iter ${j} completed. no change."
      j0=$(( j - 1 ))
      echo "combineseg_iter (${nor_or_swe} / ${ds}) - convergence reached in ${j0} iter."

    else

      # write information on iteration to log file
      {
        echo "iter=${j},n_cnv_precombineseg=${n_cnv_pre}"
        echo "iter=${j},n_cnv_postcombineseg=${n_cnv_post}"
        echo "iter=${j},penncnv_log="
        cat "${ds_out}.post_combineseg.rawcnv.log"
      } >> "$logfile"

      # print info to stdout
      echo "combineseg_iter (${nor_or_swe} / ${ds}) - iter ${j} completed."

      # store number of cnvs in this combineseg iteration
      n_cnv_steps[$j]=$n_cnv_post

      # increment iteration counter by 1
      j=$(( j + 1 ))

    fi

  done

  # make file for final post-combineseg callset
  cp \
    "${ds_out}.post_combineseg.rawcnv" \
    "${ds_out}.combineseg.rawcnv"

  # remove intermediate files
  rm \
    "${ds_out}.pre_combineseg.rawcnv" \
    "${ds_out}.post_combineseg.rawcnv"

  # form table of intensity qc metrics per sample using
  # penncnv log file from cnv calling
  python src/cnv_filtering/rawcnv_log_to_qcmetrics_tsv.py \
    --iid-prefix-rm "../../../genomic_wave/${nor_or_swe}/${ds}/${ds}." \
    --iid-postfix-rm ".txt.adjusted" \
    "${upstream_results}/penncnv_primary_callset/${nor_or_swe}/${ds}/${ds}.rawcnv.log" \
    "${ds_out}.penncnv_intensity_metrics.tsv"

  # make BED format file with raw cnv calls
  python src/cnv_filtering/filter_penncnv.py \
    --iid-prefix-rm "${ds}." \
    --iid-postfix-rm ".txt.adjusted" \
    --output-as-bed \
    "${ds_out}.combineseg.rawcnv" \
  | sort -k1,1 -k2,2n \
  > "${ds_out}.combineseg.bed"

  # filter penncnv callset to get analysis-qualifying cnvs, output in penncnv format
  python src/cnv_filtering/filter_penncnv.py \
    --length-min 30000 \
    --length-max 20000000 \
    --numsnp-min 15 \
    --iid-prefix-rm "${ds}." \
    --chr-include 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 \
    "${ds_out}.combineseg.rawcnv" \
    > "${ds_out}.qual_cnv.cnv"

  # convert to BED
  python src/cnv_filtering/filter_penncnv.py \
    --output-as-bed \
    --iid-prefix-rm "${ds}." \
    --iid-postfix-rm ".txt.adjusted" \
    "${ds_out}.qual_cnv.cnv" \
  | sort -k1,1 -k2,2n \
  > "${ds_out}.qual_cnv.bed"

  # add raw cnv metrics to metrics tsv (columns get the "_raw" postfix)
  python src/cnv_filtering/add_cnv_data_to_metrics_tsv.py \
    --group-dataset-value "GSA,${ds}" \
    --cnv-cols-postfix "_raw" \
    "${ds_out}.penncnv_intensity_metrics.tsv" \
    "${ds_out}.combineseg.bed" \
    "${ds_out}.penncnv_intensity_metrics.with_rawcnv_incomplete.tsv"

  # add qualifying (filtered) cnv metrics to metrics tsv (no column postfix)
  python src/cnv_filtering/add_cnv_data_to_metrics_tsv.py \
    --cnv-cols-postfix "" \
    "${ds_out}.penncnv_intensity_metrics.with_rawcnv_incomplete.tsv" \
    "${ds_out}.qual_cnv.bed" \
    "${ds_out}.penncnv_intensity_metrics.with_rawcnv.tsv"

done

# Combine the per-dataset outputs (results/qual_cnv/<country>/<dataset>/)
# into project-wide files directly under results/qual_cnv/.
merged_prefix=results/qual_cnv/NORDiC_2021

# all qualifying cnvs, penncnv format
cat results/qual_cnv/*/*/*.qual_cnv.cnv > "${merged_prefix}.qual_cnv.cnv"

# all qualifying cnvs, BED format, re-sorted by column 1 then numerically
# by column 2 after concatenation
cat results/qual_cnv/*/*/*.qual_cnv.bed \
  | sort -k1,1 -k2,2n \
  > "${merged_prefix}.qual_cnv.bed"

# all BAF-filtered BED files
# NOTE(review): no step in this script writes *.qual_cnv.CNV_validation_BAF.bed;
# presumably they are produced by a separate stage - confirm they exist
# before relying on this output.
cat results/qual_cnv/*/*/*.qual_cnv.CNV_validation_BAF.bed \
  > "${merged_prefix}.qual_cnv.CNV_validation_BAF.bed"

# all per-sample metrics tables (penncnv tsv), merged with the same helper
# that is used for the quantisnp tables
src/qual_cnv/tables_merge.sh \
  "${merged_prefix}.penncnv_intensity_metrics.with_rawcnv.tsv" \
  results/qual_cnv/*/*/*.penncnv_intensity_metrics.with_rawcnv.tsv

# bare exit: the job's exit status is that of the last command above
exit
