#!/bin/bash
#
# Annotation stage for the NORDiC_2021 CNV callset. The sections further down
# are switched on/off with their flag variables and write their outputs to
# results/annot_cnv/.
#
# All paths in this script are relative, so it has to be started from the
# directory that contains cfg/, src/, data/ and results/.

## DATA
# bigWig track handed to the bigwig annotation scripts further down
BW=data/annot_cnv/241-mammalian-2020v2.bigWig

## MODULE
module load python/3.8.7
module load R/4.0.4 R_packages/4.0.4
module load bioinfo-tools
module load BEDTools/2.25.0

# load filepaths
# $PY3_MODULES_DIR and $LIFTOVER are used by this script but never assigned in
# it, so they are presumably defined by this config. Stop right away when the
# config cannot be read instead of running every later step with those
# variables empty.
if [[ ! -r cfg/cnv_calling.cfg ]]
then
  echo "ERROR: cannot read cfg/cnv_calling.cfg (run this script from the repository root)" >&2
  exit 1
fi
source cfg/cnv_calling.cfg

# add to python path
# NOTE(review): when PYTHONPATH is unset or empty this value starts with ':'
# (an empty path entry) -- confirm that is acceptable.
export PYTHONPATH="$PYTHONPATH:$PWD/src/python/:$PY3_MODULES_DIR"

# make sure output dir exists; every output redirection below depends on it
mkdir -p results/annot_cnv/ || exit 1

CDS_ANNOT=1

#######################################
# CDS annotation step.
# Collects PennCNV call metrics, copies the qualifying calls and the callset
# into results/annot_cnv/, derives CDS intervals from the GRCh37.87 GTF and
# intersects the callset with them.
# NOTE(review): judging by the output file names, BED columns 4/5/6 of the
# callset are locus id / CNV type / sample id -- confirm against the producer
# of the BED files.
# Arguments: none
# Outputs:   files under results/annot_cnv/
#######################################
annotate_cds() {
  local annot_dir=results/annot_cnv
  local callset=${annot_dir}/NORDiC_2021.callset
  local cds=${annot_dir}/Homo_sapiens.GRCh37.87.CDS
  local penncnv_calls=results/qual_cnv/NORDiC_2021.qual_cnv.cnv

  # leading part of the command line shared by both metrics runs
  local -a collect_metrics=(
    python3 src/annot_cnv/collect_penncnv_call_metrics.py
    --chr-include 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
    --write-mode w
  )

  # metrics for the raw PennCNV calls
  "${collect_metrics[@]}" \
    "$penncnv_calls" \
    "${annot_dir}/NORDiC_2021.qual_cnv.metrics.tsv"

  # raw set of qualifying CNV calls
  cp results/cnv_filtering/NORDiC_2021.0.qual_cnv.bed \
    "${annot_dir}/NORDiC_2021.qual_cnv.bed"

  # CNV callset used for the caco burden sanity checks
  cp results/cnv_filtering/NORDiC_2021.10.BAF_validation_dup.bed \
    "${callset}.bed"

  # metrics again, restricted to the (column 6, column 4) pairs of the callset
  awk 'BEGIN { OFS = "\t" } { print $6, $4 }' "${callset}.bed" \
    > "${callset}.iid_locus.tsv"
  "${collect_metrics[@]}" \
    --iid-locus-extract-tsv "${callset}.iid_locus.tsv" \
    "$penncnv_calls" \
    "${callset}.metrics.tsv"

  # CDS intervals of CCDS protein-coding genes, as BED
  python3 src/annot_cnv/gtf_to_bed.py \
    --attributes-is-in "transcript_biotype:protein_coding;gene_biotype:protein_coding" \
    --tags-required "CCDS" \
    --feature-classifs "CDS" \
    data/annot_cnv/Homo_sapiens.GRCh37.87.gtf.gz \
    "${cds}.bed"

  # min/max locus per CDS gene, sorted by chromosome then start
  python3 src/annot_cnv/bed_minmax_locus.py "${cds}.bed" \
    | sort -k1,1 -k2,2n \
    > "${cds}.minmax_per_gene.bed"

  # keep an untouched copy of the baseline callset for later
  cp "${callset}.bed" "${callset}.CDS_0_1.bed"

  # CNVs overlapping at least one CDS base (-u: each CNV reported once)
  bedtools intersect -u -a "${callset}.bed" -b "${cds}.bed" \
    > "${callset}.CDS_1.bed"

  # CNVs overlapping no CDS base at all
  bedtools intersect -v -a "${callset}.bed" -b "${cds}.bed" \
    > "${callset}.CDS_0.bed"

  # one line per distinct (CNV, gene) overlap: the six CNV columns plus
  # column 10 of the joined record (4th column of the CDS BED)
  bedtools intersect -wa -wb -a "${callset}.bed" -b "${cds}.bed" \
    | cut -f 1-6,10 \
    | uniq \
    | sort -k1,1 -k2,2n \
    | uniq \
    > "${callset}.CDS_gene_overlaps.bed"

  # interval / overlapped-gene combinations
  cut -f 4,7 "${callset}.CDS_gene_overlaps.bed" \
    | sort -k1,1 -k2,2 \
    > "${callset}.interval_geneoverlaps.tsv"

  # 'allele counts' per cnv call (across array groups): number of callset
  # entries with the same column 5 that overlap the call reciprocally by >= 50%.
  # The callset is intersected with itself, so every call matches itself too.
  # uniq -c relies on bedtools emitting all hits of one call consecutively.
  bedtools intersect -wa -wb -f 0.5 -r -a "${callset}.bed" -b "${callset}.bed" \
    | awk '$5 == $11' \
    | cut -f 1-6 \
    | uniq -c \
    | awk 'BEGIN { OFS = "\t" } { print $5, $6, $7, $1 }' \
    > "${callset}.locus_cnvtype_iid_freq.tsv"
}

if [[ "$CDS_ANNOT" == 1 ]]; then
  annotate_cds
fi

FREQ_ANNOT=1

#######################################
# Frequency annotation step.
# For every (column 5, column 4) pair of the callset, counts how many entries
# with the same column 5 but a different column 6 overlap it reciprocally by
# >= 50%, and writes pairs without any such overlap with a count of 0.
# NOTE(review): the output file names suggest column 4 = interval,
# column 5 = CNV type, column 6 = sample id -- confirm.
# Arguments: none
# Outputs:   results/annot_cnv/NORDiC_2021.callset.cnvtype_interval_count*.tsv
#######################################
count_overlapping_calls() {
  local callset=results/annot_cnv/NORDiC_2021.callset
  local counts=${callset}.cnvtype_interval_count

  # intervals with at least one overlap -> column 5, column 4, count
  bedtools intersect -wa -wb -f 0.5 -r -a "${callset}.bed" -b "${callset}.bed" \
    | awk '$5 == $11 && $6 != $12 { print $4, $5 }' \
    | sort -k1,1 -k2,2 \
    | uniq -c \
    | awk 'BEGIN { OFS = "\t" } { print $3, $2, $1 }' \
    > "${counts}.overlap_ge_1.tsv"

  # intervals that never showed up above get an explicit count of 0
  # (first file fills the lookup table, second file is the callset itself)
  awk '
    BEGIN { OFS = "\t" }
    FNR == NR { seen[$1 "_" $2] = $3; next }
    !(($5 "_" $4) in seen) { print $5, $4, "0" }
  ' "${counts}.overlap_ge_1.tsv" "${callset}.bed" \
    > "${counts}.overlap_eq_0.tsv"

  # merge both parts into 1 sorted tsv
  cat "${counts}.overlap_eq_0.tsv" "${counts}.overlap_ge_1.tsv" \
    | sort -k1,1 -k2,2 \
    > "${counts}.tsv"
}

if [[ "$FREQ_ANNOT" == 1 ]]; then
  count_overlapping_calls
fi

GET_NDEV_CNVS=1

#######################################
# Neurodevelopmental CNV step.
# Turns PMID_30994872_TableS11.csv into a BED file, splits it into del/dup
# parts and intersects the qual_cnv and callset BED files with them.
# Arguments: none
# Outputs:   results/annot_cnv/PMID_30994872_TableS11*.bed and
#            results/annot_cnv/NORDiC_2021.{qual_cnv,callset}.{del,dup}_neurodev.bed
#######################################
extract_ndev_cnvs() {
  local annot_dir=results/annot_cnv
  local ndev=${annot_dir}/PMID_30994872_TableS11
  local x pair label kind

  # bed file for neurodev cnvs: skip the header line, take CSV columns 3 and 1,
  # then break "chr<N>:<start>-<end>" apart by replacing ':' and '-' with tabs
  # and dropping "chr".
  # NOTE(review): gsub edits $1 in place and awk does not re-split the record
  # afterwards, so $2..$4 in the print are still the original 2nd-4th
  # whitespace-separated fields (empty when absent) -- confirm the resulting
  # column layout is the intended one.
  tail -n +2 data/annot_cnv/PMID_30994872_TableS11.csv \
    | awk -F, 'BEGIN { OFS = "\t" } { print $3, $1 }' \
    | awk '{
        gsub(":", "\t", $1)
        gsub("-", "\t", $1)
        gsub("chr", "", $1)
        OFS = "\t"
        print $1, $2, $3, $4
      }' \
    | sort -k1,1 -k2,2n \
    > "${ndev}.bed"

  # split neurodev cnvs into dels, dups (substring match on the whole line)
  for kind in del dup; do
    grep "$kind" "${ndev}.bed" > "${ndev}.${kind}.bed"
  done

  # ndev CNVs per input set: lines containing DEL / DUP are intersected with
  # the matching neurodev file; -F 0.5 demands that at least half of the
  # neurodev interval is covered
  for x in qual_cnv callset; do
    for pair in DEL:del DUP:dup; do
      label=${pair%%:*}
      kind=${pair##*:}
      cat "${annot_dir}/NORDiC_2021.${x}.bed" \
        | grep "$label" \
        | bedtools intersect -wa -wb -F 0.5 \
            -a stdin \
            -b "${ndev}.${kind}.bed" \
        > "${annot_dir}/NORDiC_2021.${x}.${kind}_neurodev.bed"
    done
  done
}

if [[ "$GET_NDEV_CNVS" == 1 ]]; then
  extract_ndev_cnvs
fi

GET_CONSTRAINT=1

#######################################
# Constraint annotation step.
# For the qual_cnv and callset BED files: prefixes every line with "chr",
# lifts the result over with the hg19ToHg38 chain and annotates the lifted
# loci from the $BW bigWig with both bigwig annotation scripts.
# Globals:   BW (read), LIFTOVER (read; not assigned in this file)
# Arguments: none
# Outputs:   results/annot_cnv/NORDiC_2021.<set>.{with_chr.bed,hg38.bed,
#            hg38.unmapped,zoonomia.binary.tsv,zoonomia.integer.tsv}
#######################################
annotate_constraint() {
  local x stem

  for x in qual_cnv callset; do
    stem=results/annot_cnv/NORDiC_2021.${x}

    # bed with chr-prefixed chromosome names
    awk '{ print "chr" $0 }' "${stem}.bed" > "${stem}.with_chr.bed"

    # liftover to hg38; intervals that cannot be mapped end up in *.unmapped
    # shellcheck disable=SC2086 -- left unquoted as in the original, since
    # LIFTOVER may hold a command plus arguments
    $LIFTOVER \
      "${stem}.with_chr.bed" \
      data/annot_cnv/hg19ToHg38.over.chain \
      "${stem}.hg38.bed" \
      "${stem}.hg38.unmapped"

    # nbp per constraint bin, for each locus (constrained/non-constrained)
    python3 src/annot_cnv/bigwig_annotate_loci.py \
      --add-metric-cols \
      --input-bed-locus-colnum 4 \
      $BW \
      "${stem}.hg38.bed" \
      "${stem}.zoonomia.binary.tsv"

    # nbp per constraint bin (per integer-level score); -20 and 12 are passed
    # through as positional arguments right after the bigWig
    python3 src/annot_cnv/zoonomia_bigwig_annotate_loci.py \
      --add-metric-cols \
      --input-bed-locus-colnum 4 \
      $BW \
      -20 12 \
      "${stem}.hg38.bed" \
      "${stem}.zoonomia.integer.tsv"
  done
}

if [[ "$GET_CONSTRAINT" == 1 ]]; then
  annotate_constraint
fi

exit
