#!/bin/bash
#
# Builds the supplementary tables and figures under
# results/suppfigures_supptables/ and packs them into a dated tarball.
# Every path in this script is relative, so it must be started from the
# directory that contains cfg/, src/, data/ and results/.

## MODULE
module load python/3.8.7
module load R/4.0.4 R_packages/4.0.4

# load filepaths
source cfg/cnv_calling.cfg

# PY3_MODULES_DIR is not set anywhere in this script; presumably it comes from
# cfg/cnv_calling.cfg (TODO confirm). Say so loudly if it is missing instead of
# silently producing a malformed PYTHONPATH.
if [[ -z "${PY3_MODULES_DIR:-}" ]]; then
  echo "warning: PY3_MODULES_DIR is empty; it will not be added to PYTHONPATH" >&2
fi

# add to python path
# Only non-empty components are joined. A leading or trailing ':' would create
# an empty entry, which PATH-style lists conventionally treat as the current
# directory. When both variables are set the value is exactly
# "$PYTHONPATH:$PWD/src/python/:$PY3_MODULES_DIR".
PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}/src/python/${PY3_MODULES_DIR:+:${PY3_MODULES_DIR}}"
export PYTHONPATH

# make sure output dir exists
mkdir -p results/suppfigures_supptables/

# supplemental tables

MAKE_SUPP_TABLES=1
if [[ $MAKE_SUPP_TABLES == 1 ]]; then

  # Locations reused throughout this section.
  tbl_dir=results/suppfigures_supptables
  tbl_src=src/suppfigures_supptables
  filt_prefix=results/cnv_filtering/NORDiC_2021

  # Cohort options handed, in this order, to each of the three
  # per-filtering-step R invocations below.
  cohort_opts=(
    --dataset-adj "ANGI,norway_ctrls,NORDiC-SWE,NORDiC-NOR"
    --fam-file "${filt_prefix}.analysisready.fam"
    --group-dataset-iid-tsv "${filt_prefix}.analysisready.group_dataset_iid.tsv"
  )

  # Positional "label:bed" arguments, one per filtering step, in step order.
  # The same list is used for the deletion, duplication and combined tables.
  # NOTE(review): the last step points at *.BAF_validation_dup.bed even for
  # the deletion-only table - confirm that file also carries deletions.
  filter_steps=(
    "qual_cnv:${filt_prefix}.0.qual_cnv.bed"
    "centromere_telomere:${filt_prefix}.1.centromere_telomere.bed"
    "human_g1k_v37-N:${filt_prefix}.2.human_g1k_v37-N.bed"
    "segmental_dup:${filt_prefix}.3.GenomicSuperDup.bed"
    "simplerepeat_lowcomplexity_satellite:${filt_prefix}.4.repeatmasker_simplerepeat_lowcomplexity_satellite.bed"
    "Immunoglobulins_TcellReceptors:${filt_prefix}.5.Immunoglobulins_TcellReceptors.bed"
    "LCL_loci:${filt_prefix}.6.LCL_loci.bed"
    "het_filter:${filt_prefix}.7.het_filter.bed"
    "frq_gnomAD:${filt_prefix}.8.frq_gnomAD.bed"
    "frq_eachdataset:${filt_prefix}.9.frq_eachcohort.bed"
    "BAF_validation:${filt_prefix}.10.BAF_validation_dup.bed"
  )

  # Write the contents of result file $1 to supplementary table $2.
  stage_table() {
    cat "$1" > "$2"
  }

  # Table S1 : number of samples per QC step
  # The CSV is turned into TSV; awk then loads column 1 -> column 2 of
  # dataset_arraytype.tsv and, for each row arriving on stdin, prints the
  # first field, its looked-up value and the whole row. cut drops field 3,
  # which is the row's own (now duplicated) first column.
  tr "," "\t" < results/intensity_qc/NORDiC_2021.dataset_n_per_qc_step.csv \
    | awk 'FNR==NR {array[$1]=$2;next}{OFS="\t"; print $1,array[$1],$0}' \
        data/suppfigures_supptables/dataset_arraytype.tsv - \
    | cut -f 1-2,4- \
    > "${tbl_dir}/samples_per_qc_step.tsv"

  # Tables S : number of cnv calls per filtration step (per dataset),
  # deletions first and duplications second. The two runs differ only in the
  # output file name and the --dels-only / --dups-only switch.
  for cnv_type in del dup; do
    Rscript "${tbl_src}/cnv_counts_rates_per_dataset_table.R" \
      --out-tsv "${tbl_dir}/${cnv_type}_rate_per_dataset.per_filtering_step.tsv" \
      "--${cnv_type}s-only" --group GSA --report-n-cnv \
      "${cohort_opts[@]}" \
      "${filter_steps[@]}"
  done

  # Table S : number of cnv calls per filtration step (case/control)
  Rscript "${tbl_src}/cnv_counts_rates_table.R" \
    --group GSA \
    --add-ctrlonly-aov \
    "${cohort_opts[@]}" \
    --out-tsv "${tbl_dir}/cnv_filtering_stats.tsv" \
    "${filter_steps[@]}"

  # Table S : results from global CNV burden tests
  stage_table \
    results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.res.tsv \
    "${tbl_dir}/global_cnv_burden_test_results.tsv"

  # Table S : gene-based CNV burden test results
  stage_table \
    results/assoc_tests/NORDiC_2022.full.gene.res.tsv \
    "${tbl_dir}/gene_based_test_results.tsv"

  # Table S : breakpoint-based CNV burden test results
  python3 "${tbl_src}/format_breakpoint_res.py" \
    results/hdf5/NORDiC_2021.X_Y_XY_M.bim \
    results/assoc_tests/NORDiC_2022.full.breakpoint.res.tsv \
    > "${tbl_dir}/breakpoint_based_test_results.tsv"

  # Table S : extTADA test results
  stage_table \
    results/wes_meta/NORDiC_2022.OCD_rarevar_meta-analysis.771trios_2724ca_5369co.gene_res.tsv \
    "${tbl_dir}/extTADA_results.tsv"

  # Table S : CNV calls that were either neurodev, hit an NDD gene or hit a
  # gene with pLI > 0.995. For now the sample IID is left out of the table,
  # which is done by dropping the second column.
  cut -f 1,3- \
    results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.deleterious_cnv_annots.tsv \
    > "${tbl_dir}/NORDiC_deleterious_CNV_calls.tsv"

  # Table S : case-only association tests between PRS and deleterious CNV
  # carrier status
  cp \
    results/additional_analyses/cnv_prs_analysis/NORDiC.deleterious_cnv_presence_vs_PRS.tsv \
    "${tbl_dir}/NORDiC.deleterious_cnv_presence_vs_PRS.tsv"

  # Sheets for the workbook, as "SHEET NAME%PATH" in the order they are
  # passed to csvs_to_xlsx.py.
  table_specs=(
    "Table S1%${tbl_dir}/samples_per_qc_step.tsv"
    "Table S2%${tbl_dir}/del_rate_per_dataset.per_filtering_step.tsv"
    "Table S3%${tbl_dir}/dup_rate_per_dataset.per_filtering_step.tsv"
    "Table S4%${tbl_dir}/cnv_filtering_stats.tsv"
    "Table S5%${tbl_dir}/global_cnv_burden_test_results.tsv"
    "Table S6%${tbl_dir}/gene_based_test_results.tsv"
    "Table S7%${tbl_dir}/breakpoint_based_test_results.tsv"
    "Table S8%${tbl_dir}/extTADA_results.tsv"
    "Table S9%${tbl_dir}/NORDiC_deleterious_CNV_calls.tsv"
    "Table S10%${tbl_dir}/NORDiC.deleterious_cnv_presence_vs_PRS.tsv"
  )

  # roll supplemental tables into one excel spreadsheet for submission
  python3 "${tbl_src}/csvs_to_xlsx.py" \
    --input-table-delim "\t" \
    --legend-txt data/suppfigures_supptables/tables.legend.txt \
    --out-xlsx "${tbl_dir}/Supplemental_Tables_unformatted.xlsx" \
    "${table_specs[@]}"

fi

# supplemental figures

MAKE_SUPP_FIGURES=1
if [[ $MAKE_SUPP_FIGURES == 1 ]]; then

  # Locations and file-name stems reused throughout this section.
  fig_dir=results/suppfigures_supptables
  burden_dir=results/global_cnv_burden_analysis
  burden_prefix="${burden_dir}/NORDiC_2022.global_cnv_burden_analysis"
  assoc_dir=results/assoc_tests
  qq_small=NORDiC_2022.full.CDS_0_1.size_30000_100000
  qq_large=NORDiC_2022.full.CDS_0_1.size_100000_500000

  # Convert PDF $1 into PNG $2 with the ImageMagick settings shared by every
  # figure in this section.
  pdf_to_png() {
    convert -density 300 -quality 100 "$1" "$2"
  }

  # Figure S : PCs from analysis-ready case/control cohort
  # colored by case/control status
  pdf_to_png \
    "${burden_dir}/NORDiC_2022.PCA_plot.PC1-8.pdf" \
    "${fig_dir}/PCA_plot_PC1-8.png"

  # Figure S : Scree plot from analysis-ready case/control cohort
  pdf_to_png \
    "${burden_dir}/NORDiC_2022.PCA.scree_plot.pdf" \
    "${fig_dir}/PCA_scree_plot.png"

  # Figure S : PCs from analysis-ready case/control cohort,
  # colored by norway/sweden sample origin
  pdf_to_png \
    "${burden_dir}/NORDiC_2022.norway_case_sweden_ctrl_labels.PCA_plot.PC1-8.pdf" \
    "${fig_dir}/PCA_plot_PC1-8.norway_sweden.png"

  # Figure S : LRR_SD distribution per dataset
  pdf_to_png \
    results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.dataset.LRR_SD.pdf \
    "${fig_dir}/per_dataset.LRR_SD.png"

  # Figure S : raw cnv call count per dataset (or non-QC-pass call count)
  pdf_to_png \
    "${burden_prefix}.dataset.n_cnv_raw.pdf" \
    "${fig_dir}/per_dataset.n_cnv_raw.png"

  # Figure S : QC-pass cnv call count per dataset
  pdf_to_png \
    "${burden_prefix}.dataset.n_cnv.pdf" \
    "${fig_dir}/per_dataset.n_cnv.png"

  # Figure S : QQ plot of 30kb-100kb dels and dups pre and post batch eff
  # loci pruning.
  # Layout: topleft=del_pre, topright=del_post,
  #         bottomleft=dup_pre, bottomright=dup_post
  # Per CNV type: convert the pre and post PDFs to PNG, then join them side
  # by side (+append) into one row. Afterwards the del row is stacked on top
  # of the dup row (-append).
  for cnv_type in "del" "dup"; do
    pdf_to_png \
      "${assoc_dir}/${qq_small}.${cnv_type}.qq.pdf" \
      "${fig_dir}/${qq_small}.${cnv_type}.qq.png"
    pdf_to_png \
      "${assoc_dir}/${qq_small}.analysisready.${cnv_type}.qq.pdf" \
      "${fig_dir}/${qq_small}.analysisready.${cnv_type}.qq.png"
    convert \
      "${fig_dir}/${qq_small}.${cnv_type}.qq.png" \
      "${fig_dir}/${qq_small}.analysisready.${cnv_type}.qq.png" \
      +append \
      "${fig_dir}/${qq_small}.${cnv_type}.pre_post_qc.qq.png"
  done
  convert \
    "${fig_dir}/${qq_small}.del.pre_post_qc.qq.png" \
    "${fig_dir}/${qq_small}.dup.pre_post_qc.qq.png" \
    -append \
    "${fig_dir}/${qq_small}.del_dup.pre_post_qc.qq.png"

  # Figure S : QQ plot of 100kb-500kb dels and dups (no batch eff loci
  # pruning required). Both PDFs are converted to PNG and then joined.
  # NOTE(review): the previous comment described a top-bottom join, but
  # +append places the images side by side (del left, dup right); -append
  # would stack them. The command is kept as it was - confirm the intended
  # layout.
  for cnv_type in "del" "dup"; do
    pdf_to_png \
      "${assoc_dir}/${qq_large}.${cnv_type}.qq.pdf" \
      "${fig_dir}/${qq_large}.${cnv_type}.qq.png"
  done
  convert \
    "${fig_dir}/${qq_large}.del.qq.png" \
    "${fig_dir}/${qq_large}.dup.qq.png" \
    +append \
    "${fig_dir}/${qq_large}.del_dup.qq.png"

  # Figures S : global CNV burden plots, listed as
  # "<input plot suffix>:<output name suffix>" and converted in this order:
  #   - leave-one-out analysis (n_del, n_dup)
  #   - covariate leave-one-out analysis (n_del, n_dup)
  #   - case/control CNV burden by size bins
  #   - case/control CNV burden by CNV frequency
  #   - burden of CNVs in haplosensitive and triplosensitive genes
  #   - burden of neurodevelopmental CNVs
  #   - burden of noncoding CNVs per mammalian constraint bin
  burden_plots=(
    "all_loo:leave_one_out"
    "loo_covar:covar_leave_one_out"
    "size_bins:size_bins"
    "freq_bins:freq_bins"
    "pHaplo_pTriplo:phaplo_ptriplo"
    "neurodevelopmental:neurodevelopmental"
    "cons_bins.noncoding_cnvs_only:constraint_bins.noncoding_cnvs_only"
  )
  for plot_pair in "${burden_plots[@]}"; do
    pdf_to_png \
      "${burden_prefix}.plot.del_dup_burden.${plot_pair%%:*}.pdf" \
      "${fig_dir}/global_cnv_burden.${plot_pair#*:}.png"
  done

  # Figures for the document, as "LABEL%6%PATH" in the order they are passed
  # to make_supp_figures_docx.py (the meaning of the middle field is defined
  # by that script, which is not shown here).
  # NOTE(review): global_cnv_burden.neurodevelopmental.png is generated above
  # but is not in this list - confirm that the omission is intentional.
  figure_specs=(
    "Figure S1%6%${fig_dir}/PCA_plot_PC1-8.png"
    "Figure S2%6%${fig_dir}/PCA_plot_PC1-8.norway_sweden.png"
    "Figure S3%6%${fig_dir}/PCA_scree_plot.png"
    "Figure S4%6%${fig_dir}/per_dataset.LRR_SD.png"
    "Figure S5%6%${fig_dir}/per_dataset.n_cnv_raw.png"
    "Figure S6%6%${fig_dir}/per_dataset.n_cnv.png"
    "Figure S7%6%${fig_dir}/${qq_small}.del_dup.pre_post_qc.qq.png"
    "Figure S8%6%${fig_dir}/${qq_large}.del_dup.qq.png"
    "Figure S9%6%${fig_dir}/global_cnv_burden.leave_one_out.png"
    "Figure S10%6%${fig_dir}/global_cnv_burden.covar_leave_one_out.png"
    "Figure S11%6%${fig_dir}/global_cnv_burden.size_bins.png"
    "Figure S12%6%${fig_dir}/global_cnv_burden.freq_bins.png"
    "Figure S13%6%${fig_dir}/global_cnv_burden.phaplo_ptriplo.png"
    "Figure S14%6%${fig_dir}/global_cnv_burden.constraint_bins.noncoding_cnvs_only.png"
  )

  # roll supp figures into one document
  python3 src/suppfigures_supptables/make_supp_figures_docx.py \
    --header "Table of Contents" \
    --captions-txt data/suppfigures_supptables/supp_figure_captions.txt \
    --out-docx "${fig_dir}/Supplementary_Information_unformatted.docx" \
    "${figure_specs[@]}"

fi

# open question: also provide a supp file with BAF/LRR plots for all notable CNVs listed in the supp tables?

# make tarball of supp figures and tables

MAKE_TARBALL=1
if [[ $MAKE_TARBALL == 1 ]]
then
  # Date suffix (YYYYMMDD) that makes the archive name unique per day.
  DATESTAMP=$(date +%Y%m%d)

  # Pack the .docx and .xlsx deliverables into results/<name>.<date>.tar.gz,
  # with member paths relative to results/.
  # The cd and tar run in a subshell so that
  #   - a failed cd aborts before tar can write an archive in the wrong
  #     directory, and
  #   - the caller's working directory is never changed (no "cd ../" needed).
  # The subshell is the last command of this if-statement, so its status
  # (non-zero when cd or tar fails) is what a following bare `exit` reports.
  (
    cd results/ || exit 1
    tar -cvzf "NORDiC_CNV.suppfigures_supptables.${DATESTAMP}.tar.gz" \
      suppfigures_supptables/*.docx \
      suppfigures_supptables/*.xlsx
  )
fi

exit
