#!/bin/bash
#SBATCH -A sens2018605
#SBATCH -p core
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -t 08:00:00
#SBATCH --mem=8g
#SBATCH -J 09.visualize_cnvs
#SBATCH --output=logs/visualize_cnvs/visualize_cnvs.%A.out
#SBATCH --error=logs/visualize_cnvs/visualize_cnvs.%A.err

## MODULE
# Load the python interpreter and BEDTools through the cluster module system.
module load python/3.8.7
module load bioinfo-tools BEDTools/2.25.0

# load config
# Stop right away when the config file is not readable: `source` on a missing
# file only prints a warning in a non-interactive bash and the job would carry
# on with unset settings. The exit status of `source` itself is deliberately
# not checked, because it reflects the last command inside the config file.
if [[ ! -r cfg/cnv_calling.cfg ]]
then
  echo "ERROR: cannot read cfg/cnv_calling.cfg (run from the project root?)" >&2
  exit 1
fi
source cfg/cnv_calling.cfg

# update pythonpath with python3 modules dir
# (PY3_MODULES_DIR is presumably defined by the config file -- TODO confirm)
export PYTHONPATH="$PYTHONPATH:$PY3_MODULES_DIR"

# test modules
# Abort now if any of the python packages checked here cannot be imported,
# instead of ignoring the result of the check and failing later on.
python3 -c 'import h5py; import pysam; import pybedtools' || {
  echo "ERROR: python3 could not import h5py, pysam and pybedtools" >&2
  exit 1
}

# Current date as YYYYMMDD; it is embedded in the tarball and md5 file names.
DATESTAMP=$(date '+%Y%m%d')

# Create the results directory for this step unless it is already there.
mkdir -p 'results/visualize_cnvs/'

# Toggle: set to anything other than 1 to skip the deleterious-CNV plots.
DELETERIOUS_CNVS=1
if [[ $DELETERIOUS_CNVS == 1 ]]
then

  # Build the BED of deleterious CNV calls. The first input (annotations TSV)
  # is loaded into a lookup keyed on "column1:column2"; every line of the
  # second input (callset BED) whose "column4:column6" matches a key is kept.
  # Judging by the array name the key is a locus:individual-ID pair -- TODO
  # confirm. No selection by CNV type happens here; the type is handed to the
  # plotting script through --cnvtype.
  awk 'FNR==NR {locus_iid[$1":"$2]=1;next}
       {if ($4":"$6 in locus_iid) {print $0}}' \
  results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.deleterious_cnv_annots.tsv \
  results/assoc_tests/NORDiC_2021.callset.analysisready.bed \
  > results/visualize_cnvs/NORDiC_2021.callset.deleterious_CNVs.bed \
  || { echo "ERROR: could not build the deleterious CNV BED file" >&2; exit 1; }

  # for each cnv type..
  for y in "DEL" "DUP"
  do

    # make sure output dir exists
    # (no trailing slash, so the plot root below has no doubled "/")
    dirname=NORDiC_deleterious_${y}s_BAF_LRR
    outdir=results/visualize_cnvs/${dirname}
    mkdir -p "${outdir}" || exit 1

    # make baf/lrr plots for all cnv calls within array group
    # NOTE(review): the BAF key is given as "lrr" and the LRR key as "baf".
    # This looks swapped; it is left untouched in case the HDF5 datasets are
    # really stored under crossed names -- confirm against the HDF5 file.
    # When plotting fails the script stops before archiving, so the partial
    # output stays on disk for inspection instead of being tarred and removed.
    time python3 src/visualize_cnvs/plot_cnv_loci.py \
    --cnvtype "${y}" \
    --matplotlib-use Agg \
    --hdf5-baf-key lrr --hdf5-lrr-key baf \
    --plot-files-outroot "${outdir}/NORDiC_2021.callset" \
    results/hdf5/NORDiC_CNV_caco_2021.intensity_data.h5 \
    results/cnv_filtering/NORDiC_2021.analysisready.group_dataset_iid.tsv \
    results/cnv_filtering/NORDiC_2021.analysisready.fam \
    GSA \
    results/visualize_cnvs/NORDiC_2021.callset.deleterious_CNVs.bed \
    || { echo "ERROR: plotting ${y} deleterious CNVs failed" >&2; exit 1; }

    # make tarball with callset images, remove full dirs
    # Runs in a subshell so the working directory of the script never changes,
    # even on failure. The plot directory is deleted only after both the
    # tarball and its checksum were written successfully.
    (
      cd results/visualize_cnvs/ || exit 1
      tar -cvzf "${dirname}.${DATESTAMP}.tar.gz" "${dirname}" \
        && md5sum "${dirname}.${DATESTAMP}.tar.gz" \
           > "${dirname}.${DATESTAMP}.tar.gz.md5" \
        && rm -r "${dirname:?}/"
    ) || { echo "ERROR: archiving ${dirname} failed; plots kept" >&2; exit 1; }

  done

fi

# Toggle: set to anything other than 1 to skip the plots of the full callset.
ALL_CNVS=1
if [[ $ALL_CNVS == 1 ]]
then

  # for each cnv type..
  for y in "DEL" "DUP"
  do

    # make sure output dir exists
    # (no trailing slash, so the plot root below has no doubled "/")
    outdir=results/visualize_cnvs/callset_${y}
    mkdir -p "${outdir}" || exit 1

    # make baf/lrr plots for all cnv calls within array group
    # NOTE(review): the BAF key is given as "lrr" and the LRR key as "baf".
    # This looks swapped; it is left untouched in case the HDF5 datasets are
    # really stored under crossed names -- confirm against the HDF5 file.
    # When plotting fails the script stops before archiving, so the partial
    # output stays on disk for inspection instead of being tarred and removed.
    time python3 src/visualize_cnvs/plot_cnv_loci.py \
    --cnvtype "${y}" \
    --matplotlib-use Agg \
    --hdf5-baf-key lrr --hdf5-lrr-key baf \
    --plot-files-outroot "${outdir}/NORDiC_2021.callset" \
    results/hdf5/NORDiC_CNV_caco_2021.intensity_data.h5 \
    results/cnv_filtering/NORDiC_2021.analysisready.group_dataset_iid.tsv \
    results/cnv_filtering/NORDiC_2021.analysisready.fam \
    GSA \
    results/assoc_tests/NORDiC_2021.callset.analysisready.bed \
    || { echo "ERROR: plotting ${y} callset CNVs failed" >&2; exit 1; }

    # make tarball with callset images, remove full dirs
    # Runs in a subshell so the working directory of the script never changes,
    # even on failure. The plot directory is deleted only after both the
    # tarball and its checksum were written successfully.
    (
      cd results/visualize_cnvs/ || exit 1
      tar -cvzf "callset_${y}.${DATESTAMP}.tar.gz" "callset_${y}/" \
        && md5sum "callset_${y}.${DATESTAMP}.tar.gz" \
           > "callset_${y}.${DATESTAMP}.tar.gz.md5" \
        && rm -r "callset_${y}/"
    ) || { echo "ERROR: archiving callset_${y} failed; plots kept" >&2; exit 1; }

  done

fi

# Bare exit: the job ends with the status of the last command executed.
exit
