#!/bin/bash
#SBATCH -A sens2018605
#SBATCH -p core
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -t 02:00:00
#SBATCH --mem=4g
#SBATCH -J 06.global_cnv_burden_analysis
#SBATCH --output=logs/global_cnv_burden_analysis/global_cnv_burden_analysis.%A.out
#SBATCH --error=logs/global_cnv_burden_analysis/global_cnv_burden_analysis.%A.err

## MODULE
module load R/4.0.4 R_packages/4.0.4
module load bioinfo-tools
module load plink/1.90b4.9

# Fail early if the tools used below are not on PATH (e.g. a module failed to
# load); otherwise every later step would fail one by one while the job keeps
# running.
for required_tool in Rscript plink gunzip; do
  if ! command -v "$required_tool" > /dev/null; then
    echo "ERROR: required tool '${required_tool}' not found on PATH" >&2
    exit 1
  fi
done

# make sure output dir exists
burden_dir=results/global_cnv_burden_analysis
if ! mkdir -p "${burden_dir}/"; then
  echo "ERROR: could not create output directory ${burden_dir}" >&2
  exit 1
fi

# Unpack the gnomAD v2.1.1 per-gene LoF metrics table (the file with pLI) into
# the output directory. It is not referenced again in this script; presumably
# the R analysis reads the unpacked copy -- TODO confirm.
gnomad_bgz=data/annot_cnv/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz
gnomad_txt="${burden_dir}/gnomad.v2.1.1.lof_metrics.by_gene.txt"
gunzip -c "$gnomad_bgz" > "$gnomad_txt"
gunzip_status=$?
if (( gunzip_status == 2 )); then
  # gzip uses exit status 2 for warnings: the output was still written.
  echo "WARNING: gunzip reported a warning while unpacking ${gnomad_bgz}" >&2
elif (( gunzip_status != 0 )); then
  echo "ERROR: could not unpack ${gnomad_bgz} (exit status ${gunzip_status})" >&2
  # Do not leave an empty/truncated table behind for later steps to read.
  rm -f -- "$gnomad_txt"
  exit 1
fi

# Toggle: set to anything other than 1 to skip the PCA step.
RUN_PCA=1
if [[ "$RUN_PCA" == 1 ]]; then

  # Path prefixes shared by both PCA runs.
  gwas_results=../NORDiC_GWAS_CNV_202103/results
  sample_lists=results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc
  pca_root=results/global_cnv_burden_analysis/NORDiC_2022

  # Arguments common to both runs, kept in one place so the two PCAs cannot
  # drift apart:
  #   --maf 0.05  drop variants with minor allele frequency below 0.05
  #   --extract   restrict to the variants listed in the *.prune.in file
  #   --keep      restrict to the samples listed in the PCA fid/iid file
  #   --pca       write <out>.eigenvec / <out>.eigenval (read by later steps)
  # PLINK 1.9 does not depend on the order of flags on the command line.
  pca_common_args=(
    --allow-no-sex
    --bfile "${gwas_results}/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids"
    --maf 0.05
    --extract "${gwas_results}/sample_biological_qc/NORDiC_2021.1000_genomes_phase3.MERGED.vprune.prune.in"
    --keep "${sample_lists}.PCA.fidiid.tsv"
    --pca
  )

  # run PCA for full cohort
  # Abort on failure: the plotting and burden-analysis steps read the
  # resulting eigenvec/eigenval files and would otherwise use missing or
  # stale ones.
  plink "${pca_common_args[@]}" \
    --out "${pca_root}.PCA" || {
    pca_status=$?
    echo "ERROR: plink PCA (full cohort) failed with status ${pca_status};" \
      "see ${pca_root}.PCA.log" >&2
    exit "$pca_status"
  }

  # run PCA without ANGI (same as above, additionally removing the samples
  # listed in the ANGI fid/iid file)
  plink "${pca_common_args[@]}" \
    --remove "${sample_lists}.ANGI.fidiid.tsv" \
    --out "${pca_root}.noANGI.PCA" || {
    pca_status=$?
    echo "ERROR: plink PCA (without ANGI) failed with status ${pca_status};" \
      "see ${pca_root}.noANGI.PCA.log" >&2
    exit "$pca_status"
  }

fi

# Toggle: set to anything other than 1 to skip the PCA diagnostic plots.
PLOT_PCA=1
if [[ "$PLOT_PCA" == 1 ]]; then

  # Locations reused throughout this section.
  burden_dir=results/global_cnv_burden_analysis
  merged_fam=../NORDiC_GWAS_CNV_202103/results/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids.fam
  group_tsv=results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv
  country_fam="${burden_dir}/NORDiC_2022.norway_case_sweden_ctrl_labels.fam"
  country_fam_noangi="${burden_dir}/NORDiC_2022.norway_case_sweden_ctrl_labels.noANGI.fam"

  # Colour options passed to plot_PCA.R for the norway/sweden-labelled plots
  # (presumably --ca-color = "case" colour, --co-color = "control" colour).
  country_colors=(--ca-color "#CC79A7" --co-color "#009E73")

  # Scree plot of the full-cohort eigenvalues, to help decide which PCs to
  # include as covariates by default.
  Rscript src/global_cnv_burden_analysis/plot_eigenvals.R \
    "${burden_dir}/NORDiC_2022.PCA.eigenval" \
    "${burden_dir}/NORDiC_2022.PCA.scree_plot.pdf"

  # Same scree plot for the cohort without ANGI.
  # NOTE(review): this output is named "no_ANGI" while every other file in
  # this script uses "noANGI" -- confirm whether that is intentional before
  # renaming.
  Rscript src/global_cnv_burden_analysis/plot_eigenvals.R \
    "${burden_dir}/NORDiC_2022.noANGI.PCA.eigenval" \
    "${burden_dir}/NORDiC_2022.no_ANGI.PCA.scree_plot.pdf"

  # First 8 PCs of the full cohort, labelled using the merged dataset's fam.
  Rscript src/intensity_qc/plot_PCA.R \
    --first-n-pcs 8 \
    --fam-file "$merged_fam" \
    "${burden_dir}/NORDiC_2022.PCA.eigenvec" \
    "${burden_dir}/NORDiC_2022.PCA_plot"

  # First 8 PCs of the cohort without ANGI.
  Rscript src/intensity_qc/plot_PCA.R \
    --first-n-pcs 8 \
    --fam-file "$merged_fam" \
    "${burden_dir}/NORDiC_2022.noANGI.PCA.eigenvec" \
    "${burden_dir}/NORDiC_2022.noANGI.PCA_plot"

  # Build a fam-style file for the full cohort in which country stands in for
  # phenotype: rows whose 2nd column matches "norway" get phenotype 2
  # ('case'), rows matching "sweden" get phenotype 1 ('control'); all other
  # rows are dropped. Column 1 of the input is written as both FID and IID.
  awk '
    $2 ~ /norway/ { print $1, $1, "0", "0", "0", "2"; next }
    $2 ~ /sweden/ { print $1, $1, "0", "0", "0", "1" }
  ' "$group_tsv" > "$country_fam"

  # PCA plots for the full cohort, coloured by norway/sweden instead of
  # case/control.
  Rscript src/intensity_qc/plot_PCA.R \
    "${country_colors[@]}" \
    --first-n-pcs 8 \
    --fam-file "$country_fam" \
    "${burden_dir}/NORDiC_2022.PCA.eigenvec" \
    "${burden_dir}/NORDiC_2022.norway_case_sweden_ctrl_labels.PCA_plot"

  # Restrict the country-labelled fam to samples present in the noANGI PCA:
  # the first file (eigenvec) fills a lookup keyed on its 2nd column, then
  # rows of the fam whose 2nd column is in that lookup are printed.
  awk 'FNR == NR { kept_iids[$2] = 1; next } ($2 in kept_iids)' \
    "${burden_dir}/NORDiC_2022.noANGI.PCA.eigenvec" \
    "$country_fam" \
    > "$country_fam_noangi"

  # PCA plots for the noANGI subset, coloured by norway/sweden instead of
  # case/control.
  Rscript src/intensity_qc/plot_PCA.R \
    "${country_colors[@]}" \
    --first-n-pcs 8 \
    --fam-file "$country_fam_noangi" \
    "${burden_dir}/NORDiC_2022.noANGI.PCA.eigenvec" \
    "${burden_dir}/NORDiC_2022.norway_case_sweden_ctrl_labels.noANGI.PCA_plot"

fi

# Toggle: set to anything other than 1 to skip the full-cohort analysis.
RUN_ANALYSIS=1
if [[ "$RUN_ANALYSIS" == 1 ]]; then

  # Output root; the R script's stdout/stderr go to <root>.log / <root>.err.
  burden_root=results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis

  # run CNV burden analysis Rscript (full cohort, full-cohort PCs)
  # Because stderr is redirected to a file, a failure would leave no trace in
  # the SLURM log, and the results-plotting step would then plot whatever
  # tables a previous run left behind. Report the failure and stop instead.
  Rscript 06.global_cnv_burden_analysis.R \
    --in-eigenvec results/global_cnv_burden_analysis/NORDiC_2022.PCA.eigenvec \
    --in-fam results/cnv_filtering/NORDiC_2021.analysisready.fam \
    --in-iid-group-phe-tsv results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv \
    --outroot "${burden_root}" \
    > "${burden_root}.log" \
    2> "${burden_root}.err" || {
    analysis_status=$?
    echo "ERROR: CNV burden analysis failed with status ${analysis_status};" \
      "see ${burden_root}.err" >&2
    exit "$analysis_status"
  }

fi

# Toggle: set to anything other than 1 to skip the size-restricted analysis.
RUN_ANALYSIS_GT_100KB=1
if [[ "$RUN_ANALYSIS_GT_100KB" == 1 ]]; then

  # Output root; the R script's stdout/stderr go to <root>.log / <root>.err.
  burden_root_gt100kb=results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.gt100kb

  # run CNV burden analysis Rscript restricted by CNV size. The flag is
  # --size-ge-100kb (presumably size >= 100 kb -- confirm in the R script),
  # although the output files are labelled "gt100kb".
  # Because stderr is redirected to a file, a failure would leave no trace in
  # the SLURM log, and the results-plotting step would then plot whatever
  # tables a previous run left behind. Report the failure and stop instead.
  Rscript 06.global_cnv_burden_analysis.R \
    --size-ge-100kb \
    --in-eigenvec results/global_cnv_burden_analysis/NORDiC_2022.PCA.eigenvec \
    --in-fam results/cnv_filtering/NORDiC_2021.analysisready.fam \
    --in-iid-group-phe-tsv results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv \
    --outroot "${burden_root_gt100kb}" \
    > "${burden_root_gt100kb}.log" \
    2> "${burden_root_gt100kb}.err" || {
    analysis_status=$?
    echo "ERROR: CNV burden analysis (size-restricted) failed with status" \
      "${analysis_status}; see ${burden_root_gt100kb}.err" >&2
    exit "$analysis_status"
  }

fi

# Toggle: set to anything other than 1 to skip the analysis without ANGI.
RUN_ANALYSIS_NO_ANGI=1
if [[ "$RUN_ANALYSIS_NO_ANGI" == 1 ]]; then

  # Output root; the R script's stdout/stderr go to <root>.log / <root>.err.
  burden_root_noangi=results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.noANGI

  # run CNV burden analysis Rscript (without ANGI), using the eigenvectors
  # from the PCA that was computed without the ANGI samples.
  # Because stderr is redirected to a file, a failure would leave no trace in
  # the SLURM log, and the results-plotting step would then plot whatever
  # tables a previous run left behind. Report the failure and stop instead.
  Rscript 06.global_cnv_burden_analysis.R \
    --no-angi \
    --in-eigenvec results/global_cnv_burden_analysis/NORDiC_2022.noANGI.PCA.eigenvec \
    --in-fam results/cnv_filtering/NORDiC_2021.analysisready.fam \
    --in-iid-group-phe-tsv results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv \
    --outroot "${burden_root_noangi}" \
    > "${burden_root_noangi}.log" \
    2> "${burden_root_noangi}.err" || {
    analysis_status=$?
    echo "ERROR: CNV burden analysis (without ANGI) failed with status" \
      "${analysis_status}; see ${burden_root_noangi}.err" >&2
    exit "$analysis_status"
  }

fi

# Toggle: set to anything other than 1 to skip plotting the analysis results.
PLOT_RESULTS=1

# Number of results plots that failed; decides the job's exit status below.
plot_failures=0

#######################################
# Run one results-plotting R script.
# Globals:   plot_failures (incremented when the R script fails)
# Arguments: $1 - path to the plotting R script
#            $2 - input results table (.tsv)
#            $3 - output prefix handed to the R script; its stdout and
#                 stderr are written to <prefix>.log and <prefix>.err
# Outputs:   one line on stderr if the R script fails
# Returns:   0 always, so the remaining plots are still attempted
#######################################
run_burden_plot() {
  local plot_script=$1
  local in_tsv=$2
  local out_prefix=$3

  if ! Rscript "$plot_script" "$in_tsv" "$out_prefix" \
    > "${out_prefix}.log" \
    2> "${out_prefix}.err"; then
    echo "ERROR: ${plot_script} failed for ${in_tsv}; see ${out_prefix}.err" >&2
    plot_failures=$((plot_failures + 1))
  fi
  return 0
}

if [[ "$PLOT_RESULTS" == 1 ]]; then

  plot_dir=results/global_cnv_burden_analysis
  res_root="${plot_dir}/NORDiC_2022.global_cnv_burden_analysis"
  coco_caca_script=src/global_cnv_burden_analysis/plot_global_cnv_burden_coco_caca.R
  results_script=src/global_cnv_burden_analysis/plot_global_cnv_burden_analysis_results.R

  # plot results (coco)
  run_burden_plot "$coco_caca_script" \
    "${res_root}.coco.tsv" \
    "${plot_dir}/NORDiC_2022.coco_cnv_burden_analysis.plot"

  # plot results (caca)
  run_burden_plot "$coco_caca_script" \
    "${res_root}.caca.tsv" \
    "${plot_dir}/NORDiC_2022.caca_cnv_burden_analysis.plot"

  # plot results
  run_burden_plot "$results_script" \
    "${res_root}.res.tsv" \
    "${res_root}.plot"

  # plot results (no ANGI)
  run_burden_plot "$results_script" \
    "${res_root}.noANGI.res.tsv" \
    "${res_root}.noANGI.plot"

  # plot results (gt100kb)
  run_burden_plot "$results_script" \
    "${res_root}.gt100kb.res.tsv" \
    "${res_root}.gt100kb.plot"

fi

# A bare `exit` would report only the status of the last plot; fail the job
# if any of the plots failed so earlier failures are not masked.
if (( plot_failures > 0 )); then
  echo "ERROR: ${plot_failures} results plot(s) failed" >&2
  exit 1
fi
exit 0
