#!/bin/bash

## PARAM
# Genotyping datasets handled by the per-dataset frequency filter (filter9).
# Each element is "<NOR|SWE>:<dataset name>"; the loop in filter9 splits the
# element on the colon.
DATASETS=(
  "NOR:NORDiC-NOR_cases_2019_1"
  "NOR:NORDiC-NOR_cases_2019_2"
  "NOR:NORDiC-NOR_cases_2020"
  "NOR:norway_ctrls_2019"
  "NOR:norway_ctrls_2020"
  "SWE:LG500_ctrls_2019"
  "SWE:NORDiC-SWE_cases_2018"
  "SWE:NORDiC-SWE_cases_2020"
  "SWE:ANGI_PhaseII_Pedersen_Controls_GSA-MD_wave1"
  "SWE:ANGI_PhaseII_Pedersen_Controls_GSA-MD_wave2"
)
 
## MODULE
# Load the tool versions this pipeline was run with.
module load python/3.8.7
module load R/4.0.4 R_packages/4.0.4
module load bioinfo-tools
module load seqtk/1.2-r101
module load BEDTools/2.25.0

# get path to penncnv scripts
# LIFTOVER and PY3_MODULES_DIR are used below but never assigned in this
# script, so they presumably come from this config file. Stop right away if
# it cannot be read: otherwise those variables stay empty and later steps
# fail (or silently produce empty outputs) far from the real cause.
if [[ ! -r cfg/cnv_calling.cfg ]]; then
  echo "ERROR: cannot read cfg/cnv_calling.cfg (is the working directory the project root?)" >&2
  exit 1
fi
source cfg/cnv_calling.cfg

# add to python path
# ${PYTHONPATH:+...} only inserts the old value and its ':' separator when
# PYTHONPATH is already non-empty, so no stray leading empty entry is created.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}/src/python/:${PY3_MODULES_DIR}"

# make sure output dir exists
mkdir -p results/cnv_filtering/

# Stage 1: build the BED files of regions to exclude. The filtering stage
# below passes them to filter_cnv_bed.py through its -x option.
# Set BUILD_BLACKLIST_REGION_BEDS to something other than 1 to skip the stage.
BUILD_BLACKLIST_REGION_BEDS=1
if [[ $BUILD_BLACKLIST_REGION_BEDS == 1 ]]
then

  # The liftover step near the end of this stage runs whatever LIFTOVER names.
  # If it is unset the command line would collapse, the lifted BED would never
  # be written, and an empty exclusion BED would be produced without any
  # obvious error. Abort up front instead.
  : "${LIFTOVER:?LIFTOVER is not set (expected to be defined by cfg/cnv_calling.cfg)}"

  # make bed file with all centromere and telomere loci, with 500kb padding
  # on each edge
  python3 src/cnv_filtering/make_centromere_telomere_blacklist_bed.py \
    --no-chr \
    --telomere-size 500000 \
    --telomere-extend 0 \
    --centromere-extend 500000 \
    hg19 \
    > results/cnv_filtering/hg19.centromere_500kb_padding.telomere_500kb.bed

  # get N-masked GRCh37 bed
  seqtk cutN -gp10000000 -n1 data/cnv_filtering/human_g1k_v37.fasta \
    > results/cnv_filtering/human_g1k_v37-N.bed

  # form bed file with segmental duplications
  # https://humanparalogy.gs.washington.edu/build37/data/GRCh37GenomicSuperDup.tab
  # Both members of each duplication pair (columns 1-3 and 7-9) are emitted,
  # the header line is skipped, 'chr' is stripped, and contigs whose name
  # contains '_' are dropped before overlapping intervals are merged.
  # NOTE(review): start coordinates are shifted by -1, which assumes 1-based
  # starts in the .tab file; UCSC-style tables are usually already 0-based.
  # Confirm against the source file.
  tail -n+2 data/cnv_filtering/GRCh37GenomicSuperDup.tab \
    | awk 'BEGIN {OFS="\t"} {print $1,$2-1,$3; print $7,$8-1,$9}' \
    | sed 's/chr//g' \
    | sort -k1,1 -k2,2n \
    | uniq \
    | grep -v "_" \
    | bedtools merge -i stdin \
    > results/cnv_filtering/GRCh37GenomicSuperDup.bed

  # repeatmasker : downloaded from ucsc genome table browser on 20210621.
  # form bed file with simple repeats, low complexity repeats, satellite reps.
  # http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz
  # awk already writes tab-separated fields, so no further delimiter
  # conversion is needed before the downstream tools.
  # NOTE(review): same -1 shift on the start column as for the segmental
  # duplications above; confirm the coordinate convention of rmsk.txt.gz.
  zcat data/cnv_filtering/rmsk.txt.gz \
    | awk 'BEGIN {OFS="\t"}
           $12=="Simple_repeat" || $12=="Low_complexity" || $12=="Satellite" {
             print $6,$7-1,$8
           }' \
    | grep -v "_" \
    | sed 's/chr//g' \
    | sort -k1,1 -k2,2n \
    | bedtools merge -i stdin \
    > results/cnv_filtering/repeatmasker_GRCh37.simplerepeat_lowcomplexity_satellite.bed

  # get list of ENSG IDs for genes that are either classified as
  # 1) Immunoglobulin, or
  # 2) T cell receptor
  # Column 10 holds the Ensembl gene ID; the header cell and empty cells are
  # skipped.
  awk -F"\t" '$10 != "Ensembl gene ID" && $10 != "" {print $10}' \
    data/cnv_filtering/HGNC_Immunoglobulins_genegroup_348.20220113.txt \
    data/cnv_filtering/HGNC_TcellReceptors_genegroup_370.20220113.txt \
    > results/cnv_filtering/HGNC_Immunoglobulins_TcellReceptors.ENSG.list

  # make BED file with loci for immunoglobulin and T cell receptor genes.
  # Only the --out-rgn-bed file is used later in this script (as an -x
  # exclusion BED).
  python3 src/annot_cnv/cnv_gtf_annotation.py \
    --feature-classifs gene \
    --ensg-keep-listfile results/cnv_filtering/HGNC_Immunoglobulins_TcellReceptors.ENSG.list \
    --out-rgn-bed results/cnv_filtering/HGNC_Immunoglobulins_TcellReceptors.ENSG.bed \
    data/annot_cnv/Homo_sapiens.GRCh37.87.gtf.gz \
    results/qual_cnv/NORDiC_2021.qual_cnv.bed \
    results/cnv_filtering/HGNC_Immunoglobulins_TcellReceptors.qual_cnv.bed

  # Take File S1 from PMID 22374857 (Shirley et al 2012), remove header
  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3370055/bin/NIHMS361907-supplement-Supp_File_S1.bed
  # Keeping only lines that start with "chr" drops the header; the first three
  # columns are retained.
  grep "^chr" \
    data/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.bed \
    | sort -k1,1 -k2,2n \
    | cut -f1-3 \
    > results/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.hg18.bed

  # liftover BED from Shirley et al 2012 from hg18 to hg19
  # LIFTOVER is expanded unquoted exactly as before, so a config value that
  # carries extra words keeps working.
  # shellcheck disable=SC2086
  $LIFTOVER \
    results/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.hg18.bed \
    data/cnv_filtering/hg18ToHg19.over.chain \
    results/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.hg19.bed \
    results/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.hg19.unmapped

  # remove 'chr' from bed
  sed 's/^chr//g' \
    results/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.hg19.bed \
    > results/cnv_filtering/NIHMS361907-supplement-Supp_File_S1.hg19.no_chr.bed

fi

FILTERING_STEPS=1
if [[ $FILTERING_STEPS == 1 ]]
then

  FILTERING_PT1=1; if [[ $FILTERING_PT1 == 1 ]]; then

  # Filtering part 1 (filters 0-9): restrict the call set to samples that
  # passed intensity QC, then apply the exclusion-region, heterozygosity and
  # frequency filters, drawing a hotspot plot after most steps.

  # Directory holding this stage's intermediate and final files.
  cnv_dir=results/cnv_filtering

  # IIDs of samples that passed the intensity_qc step.
  qc_iids=results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.iids.txt

  # Exclusion BEDs (-x arguments) accumulated so far. Every region filter is
  # cumulative: it re-applies all earlier exclusion BEDs plus one new one.
  exclude_args=()

  # Draw the hotspot plot for one filter step.
  #   $1 - path stem; reads "<stem>.bed", writes "<stem>.hotspot.pdf"
  plot_hotspots() {
    local stem=$1
    Rscript src/visualize_cnvs/cnv_hotspot_plot.R \
      --ref-genome hg19 \
      --cnv-bed "${stem}.bed" \
      --fam "${cnv_dir}/NORDiC_2021.analysisready.fam" \
      --out-pdf "${stem}.hotspot.pdf"
  }

  # Run the size/overlap/exclusion-region filter on the QC-passing call set
  # (NORDiC_2021.0.qual_cnv.bed) with every exclusion BED collected so far.
  #   $1 - output BED
  #   Globals: cnv_dir (read), exclude_args (read)
  filter_regions() {
    local out_bed=$1
    python3 src/cnv_filtering/filter_cnv_bed.py \
      --recipoverlap 0.5 \
      --xcov 0.5 \
      --minsize 30000 \
      --maxsize 20000000 \
      --dist 30000 \
      "${exclude_args[@]}" \
      "${cnv_dir}/NORDiC_2021.0.qual_cnv.bed" \
      "${out_bed}"
  }

  # get file that has group, dataset and iid
  tail -n+2 \
    results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.tsv \
    | cut -f1,2,3 \
    > "${cnv_dir}/NORDiC_2021.analysisready.group_dataset_iid.tsv"

  # subset the original qual cnv bed file on IIDs of samples that
  # passed the intensity_qc step (IID is column 6 of the BED)
  awk 'FNR==NR {iids[$1]=1; next}
       ($6 in iids)' \
    "${qc_iids}" \
    results/qual_cnv/NORDiC_2021.qual_cnv.bed \
    > "${cnv_dir}/NORDiC_2021.0.qual_cnv.bed"

  # make a fam file that only has subset of intensity qc passing samples in it
  # (IID is column 2 of the fam file)
  awk 'FNR==NR {iids[$1]=1; next}
       ($2 in iids)' \
    "${qc_iids}" \
    ../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex_rel_EUR_nonoutlier_het.fam \
    > "${cnv_dir}/NORDiC_2021.analysisready.fam"

  # hotspot plot (filter0)
  plot_hotspots "${cnv_dir}/NORDiC_2021.0.qual_cnv"

  # filter1 : centromere and telomere regions
  exclude_args+=(-x "${cnv_dir}/hg19.centromere_500kb_padding.telomere_500kb.bed")
  filter_regions "${cnv_dir}/NORDiC_2021.1.centromere_telomere.bed"
  plot_hotspots "${cnv_dir}/NORDiC_2021.1.centromere_telomere"

  # filter2 : polyN in the reference genome
  exclude_args+=(-x "${cnv_dir}/human_g1k_v37-N.bed")
  filter_regions "${cnv_dir}/NORDiC_2021.2.human_g1k_v37-N.bed"
  plot_hotspots "${cnv_dir}/NORDiC_2021.2.human_g1k_v37-N"

  # filter3 : segmental duplication loci
  exclude_args+=(-x "${cnv_dir}/GRCh37GenomicSuperDup.bed")
  filter_regions "${cnv_dir}/NORDiC_2021.3.GenomicSuperDup.bed"
  plot_hotspots "${cnv_dir}/NORDiC_2021.3.GenomicSuperDup"

  # filter4 : repeatmasker loci that have one of the following labels:
  # 1. simple repeats
  # 2. low complexity regions
  # 3. satellite regions
  exclude_args+=(-x "${cnv_dir}/repeatmasker_GRCh37.simplerepeat_lowcomplexity_satellite.bed")
  filter_regions "${cnv_dir}/NORDiC_2021.4.repeatmasker_simplerepeat_lowcomplexity_satellite.bed"
  plot_hotspots "${cnv_dir}/NORDiC_2021.4.repeatmasker_simplerepeat_lowcomplexity_satellite"

  # filter5 : loci for Immunoglobulin and T cell receptor genes
  exclude_args+=(-x "${cnv_dir}/HGNC_Immunoglobulins_TcellReceptors.ENSG.bed")
  filter_regions "${cnv_dir}/NORDiC_2021.5.Immunoglobulins_TcellReceptors.bed"
  plot_hotspots "${cnv_dir}/NORDiC_2021.5.Immunoglobulins_TcellReceptors"

  # filter6 : cnv loci associated with LCL status in EBV-transformed cell-lines
  exclude_args+=(-x "${cnv_dir}/NIHMS361907-supplement-Supp_File_S1.hg19.no_chr.bed")
  filter_regions "${cnv_dir}/NORDiC_2021.6.LCL_loci.bed"
  plot_hotspots "${cnv_dir}/NORDiC_2021.6.LCL_loci"

  # filter7 : heterozygosity
  python3 src/cnv_filtering/CNV_filtration_heterozygosity.py \
    --out-metrics-tsv "${cnv_dir}/NORDiC_2021.7.het_filter.stats.tsv" \
    results/hdf5/NORDiC_CNV_caco_2021.intensity_data.h5 \
    "${cnv_dir}/NORDiC_2021.6.LCL_loci.bed" \
    "${cnv_dir}/NORDiC_2021.7.het_filter.bed"

  # filter8 : freq (gnomAD)
  # Starts from the filter7 output and re-applies all six exclusion BEDs.
  AFS="AF,AFR_AF,AFR_NONREF_FREQ,AMR_AF,AMR_NONREF_FREQ,CSA_AF,CSA_NONREF_FREQ,EAS_AF,EAS_NONREF_FREQ,EUR_AF,EUR_NONREF_FREQ,MID_AF,MID_NONREF_FREQ,OCN_AF,OCN_NONREF_FREQ,OTH_AF,OTH_NONREF_FREQ,SAS_AF,SAS_NONREF_FREQ"
  python3 src/cnv_filtering/filter_cnv_bed.py \
    --recipoverlap 0.5 \
    --xcov 0.5 \
    --maxfreq 0.01 \
    --minsize 30000 \
    --maxsize 20000000 \
    --dist 30000 \
    --vcf data/cnv_filtering/gnomad_v2.1_sv.nonneuro.sites.vcf.gz \
    --vcf-af-fields "${AFS}" \
    "${exclude_args[@]}" \
    "${cnv_dir}/NORDiC_2021.7.het_filter.bed" \
    "${cnv_dir}/NORDiC_2021.8.frq_gnomAD.bed"

  # filter9 : freq in each individual dataset
  # First write datasets.in.tsv with one line per dataset:
  #   <dataset> <number of samples> <per-dataset BED, relative to ${cnv_dir}>
  : > "${cnv_dir}/datasets.in.tsv"
  for dsfull in "${DATASETS[@]}"; do

    # get nor_or_swe, dataset name (fields 1 and 2 of the ':'-separated entry)
    IFS=: read -r nor_or_swe ds _ <<< "${dsfull}"
    echo "${dsfull}"

    # make sure output dir exists
    ds_dir="${cnv_dir}/${nor_or_swe}/${ds}"
    mkdir -p "${ds_dir}"

    # derive IID list for samples in dataset
    ds_iids="${ds_dir}/${nor_or_swe}.${ds}.analysisready.iids.list"
    awk -v DS="${ds}" '$2==DS {print $3}' \
      "${cnv_dir}/NORDiC_2021.analysisready.group_dataset_iid.tsv" \
      > "${ds_iids}"

    # get subset of calls that are specifically from dataset
    pre_frq_filter_bed="${nor_or_swe}/${ds}/${nor_or_swe}.${ds}.pre_frq_filter.bed"
    awk 'FNR==NR {iids[$1]=1; next}
         ($6 in iids)' \
      "${ds_iids}" \
      "${cnv_dir}/NORDiC_2021.8.frq_gnomAD.bed" \
      > "${cnv_dir}/${pre_frq_filter_bed}"

    # add to input datasets file
    N_DS=$(wc -l < "${ds_iids}")
    printf '%s\t%s\t%s\n' "${ds}" "${N_DS}" "${pre_frq_filter_bed}" \
      >> "${cnv_dir}/datasets.in.tsv"

  done

  # apply filter
  # The BED paths listed in datasets.in.tsv are relative to ${cnv_dir}, so the
  # filter runs from inside that directory. A subshell keeps the caller's
  # working directory intact, and the step is skipped if the cd fails rather
  # than running against the wrong directory. The filter is run once: its
  # inputs and output are fixed, so repeating it only rewrites the same file.
  N=$(wc -l < "${cnv_dir}/NORDiC_2021.analysisready.fam")
  (
    cd "${cnv_dir}" || exit 1
    python3 ../../src/cnv_filtering/filter_cnv_bed.py \
      --recipoverlap 0.5 \
      --xcov 0.5 \
      --maxfreq 0.01 \
      --minsize 30000 \
      --maxsize 20000000 \
      --dist 30000 \
      --cohorts-list datasets.in.tsv \
      --allcohorts NORDiC_2021.8.frq_gnomAD.bed \
      --allcohorts_nsamp "${N}" \
      NORDiC_2021.8.frq_gnomAD.bed \
      NORDiC_2021.9.frq_eachcohort.bed
  )

  # hotspot plot (filter9)
  plot_hotspots "${cnv_dir}/NORDiC_2021.9.frq_eachcohort"

  fi

  FILTERING_PT2=1; if [[ $FILTERING_PT2 == 1 ]]; then

  # Filtering part 2 (filter10): BAF-based validation of the calls that
  # survived filter9, followed by a hotspot plot of the result.

  # Filtered call set written by the validation step and read by the plot.
  # Keeping the path in one variable stops the two commands from drifting
  # apart.
  baf_validated_bed=results/cnv_filtering/NORDiC_2021.10.BAF_validation_dup.bed

  # filter10 : BAF validation
  # Each --hdf5-*-key option is given the key that matches its own name. The
  # values were previously crossed (BAF key "lrr", LRR key "baf"), which would
  # make the script read the wrong intensity dataset.
  # NOTE(review): this assumes the HDF5 datasets are named "baf" and "lrr";
  # confirm against the file layout.
  time python3 src/cnv_filtering/dup_validation_BAF.py \
    --hdf5-baf-key baf \
    --hdf5-lrr-key lrr \
    --matplotlib-use Agg \
    --baf-cluster-calling-method k-means \
    --out-filtered-cnv "${baf_validated_bed}" \
    --print-current-cnv \
    results/hdf5/NORDiC_CNV_caco_2021.intensity_data.h5 \
    results/cnv_filtering/NORDiC_2021.9.frq_eachcohort.bed \
    results/cnv_filtering/NORDiC_2021.10.BAF_validation_dup

  # hotspot plot (filter10)
  # Reads the BED written just above. The previous input name
  # (NORDiC_2021.10.BAF_validation.bed) is not produced anywhere in this
  # script. The PDF name is left unchanged.
  Rscript src/visualize_cnvs/cnv_hotspot_plot.R \
    --ref-genome hg19 \
    --cnv-bed "${baf_validated_bed}" \
    --fam results/cnv_filtering/NORDiC_2021.analysisready.fam \
    --out-pdf results/cnv_filtering/NORDiC_2021.10.BAF_validation.hotspot.pdf

  fi

fi

exit
