#!/bin/bash
#SBATCH -A sens2018605
#SBATCH -p core
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -t 02:00:00
#SBATCH --mem=8g
#SBATCH -J 10.additional_analyses
#SBATCH --output=logs/additional_analyses/additional_analyses.%A.out
#SBATCH --error=logs/additional_analyses/additional_analyses.%A.err

## MODULE
# tools used below: python3 (CNV filtering), Rscript (analysis scripts), plink (PCA)
module load python/3.8.7
module load R/4.0.4 R_packages/4.0.4
module load bioinfo-tools plink/1.90b4.9 BEDTools/2.25.0

# get path to penncnv scripts
# NOTE(review): PY3_MODULES_DIR (used below) is presumably defined by this
# config file as well -- confirm against cfg/cnv_calling.cfg
source cfg/cnv_calling.cfg
if [[ -z "${PY3_MODULES_DIR:-}" ]]; then
  echo "WARNING: PY3_MODULES_DIR is not set after sourcing cfg/cnv_calling.cfg" >&2
fi

# add to python path
# Parts are joined only when non-empty: a leading, trailing or doubled ':'
# creates an empty PYTHONPATH entry, which python resolves to the current
# directory instead of ignoring it.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}/src/python/${PY3_MODULES_DIR:+:${PY3_MODULES_DIR}}"

# make sure output dir exists
mkdir -p results/additional_analyses/

# assoc test between deleterious CNV carrier status and PRS

#######################################
# Association test between deleterious CNV carrier status and PRS.
# Steps:
#   1. write the FID/IID keep-list of .fam samples whose IID (column 2)
#      appears in column 1 of the CNV carrier table
#   2. run a plink PCA restricted to those samples and the pruned SNP list
#   3. run the R script comparing PRS in carriers and non-carriers
# Each step runs only if the previous one succeeded, so the R script is never
# fed a missing PCA or a stale one left over from an earlier run.
# Globals:   none
# Arguments: none
# Outputs:   files under results/additional_analyses/cnv_prs_analysis/;
#            error messages on stderr
# Returns:   0 on success, 1 as soon as a step fails
#######################################
cnv_prs_analysis() {
  local out_dir=results/additional_analyses/cnv_prs_analysis
  local prs_dir=data/additional_analyses
  local geno_prefix=data/additional_analyses/ocd_ndnorswe_eur_mh-qc.hg19.ch.fl.bg.qcpass
  local carrier_table=results/cnv_carrier_table/NORDiC.cnv_carrier_table.20230516.del_dup.carriers.tsv
  local pruned_snps=../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.1000_genomes_phase3.MERGED.vprune.prune.in
  local keep_list="${out_dir}/NORDiC_cases.fidiid.txt"
  local pca_prefix="${out_dir}/NORDiC_cases.PCA"

  # make output dir
  mkdir -p "${out_dir}/" \
    || { echo "ERROR (CNV/PRS analysis): cannot create ${out_dir}" >&2; return 1; }

  # compute PCA for NORDiC cases only
  # (first file: remember IIDs of the carrier table; second file: print
  # FID IID of every .fam row whose IID was seen)
  awk 'FNR==NR {iids[$1]=1;next} {if ($2 in iids) {print $1,$2}}' \
    "${carrier_table}" \
    "${geno_prefix}.fam" \
    > "${keep_list}" \
    || { echo "ERROR (CNV/PRS analysis): could not build ${keep_list}" >&2; return 1; }
  plink \
    --bfile "${geno_prefix}" \
    --extract "${pruned_snps}" \
    --keep "${keep_list}" \
    --pca \
    --out "${pca_prefix}" \
    || { echo "ERROR (CNV/PRS analysis): plink PCA failed" >&2; return 1; }

  # compare PRS in deleterious cnv carriers and non-carriers
  Rscript src/additional_analyses/prs_deleterious_cnv_presence.R \
    "${carrier_table}" \
    results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.metrics.tsv \
    "${pca_prefix}.eigenvec" \
    "${prs_dir}/ocd_ndnorswe.standingheight.PLINK.profile" \
    "${prs_dir}/ocd_ndnorswe.ocd2017.PLINK.profile" \
    "${prs_dir}/ocd_ndnorswe.CDG2019_ex23andMe.PLINK.profile" \
    "${out_dir}/NORDiC.deleterious_cnv_presence_vs_PRS.tsv" \
    || { echo "ERROR (CNV/PRS analysis): PRS comparison R script failed" >&2; return 1; }
}

# set to anything other than 1 to skip this analysis
CNV_PRS_ANALYSIS=1
if [[ $CNV_PRS_ANALYSIS == 1 ]]
then
  cnv_prs_analysis
fi

# assoc test between large CNV burden in chrX and OCD case status

# inputs / outputs shared by the chrX analysis steps
CHRX_OUT_DIR=results/additional_analyses/chrX_analysis
CHRX_GENO_PREFIX=data/additional_analyses/ocd_ndnorswe_eur_mh-qc.hg19.ch.fl.bg.qcpass
CHRX_PRUNED_SNPS=../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.1000_genomes_phase3.MERGED.vprune.prune.in
CHRX_PHENO=results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv

# cohorts contributing raw calls, written as "<NOR|SWE>:<dataset name>"
DATASETS=("NOR:NORDiC-NOR_cases_2019_1"
          "NOR:NORDiC-NOR_cases_2019_2"
          "NOR:NORDiC-NOR_cases_2020"
          "NOR:norway_ctrls_2019"
          "NOR:norway_ctrls_2020"
          "SWE:LG500_ctrls_2019"
          "SWE:NORDiC-SWE_cases_2018"
          "SWE:NORDiC-SWE_cases_2020"
          "SWE:ANGI_PhaseII_Pedersen_Controls_GSA-MD_wave1"
          "SWE:ANGI_PhaseII_Pedersen_Controls_GSA-MD_wave2"
         )

# Print an error message for the chrX analysis on stderr; always returns 1.
chrx_fail() {
  echo "ERROR (chrX analysis): $*" >&2
  return 1
}

#######################################
# Run a plink PCA restricted to case/control samples of one sex.
# Globals:   CHRX_OUT_DIR, CHRX_GENO_PREFIX, CHRX_PRUNED_SNPS, CHRX_PHENO (read)
# Arguments: $1 - label used in output file names ("male" or "female")
#            $2 - sex code compared with column 5 of the .fam file
# Outputs:   NORDiC_case_control.<label>_only.fidiid.txt and .PCA.* files
# Returns:   0 on success, 1 if the keep-list or the PCA could not be made
#######################################
chrx_sex_pca() {
  local label=$1 sex_code=$2
  local keep_list="${CHRX_OUT_DIR}/NORDiC_case_control.${label}_only.fidiid.txt"

  # first file: remember IIDs of the phenotype table; second file: print
  # FID IID of every .fam row with a known IID and the requested sex code
  awk -v sex="${sex_code}" \
    'FNR==NR {iids[$1]=1;next} {if (($2 in iids) && ($5==sex)) {print $1,$2}}' \
    "${CHRX_PHENO}" \
    "${CHRX_GENO_PREFIX}.fam" \
    > "${keep_list}" \
    || { chrx_fail "could not build ${keep_list}"; return 1; }
  plink \
    --bfile "${CHRX_GENO_PREFIX}" \
    --extract "${CHRX_PRUNED_SNPS}" \
    --keep "${keep_list}" \
    --pca \
    --out "${CHRX_OUT_DIR}/NORDiC_case_control.${label}_only.PCA" \
    || { chrx_fail "plink PCA failed (${label})"; return 1; }
}

#######################################
# Collect suitable large chrX calls from every cohort into one BED file.
# Keeps rows with column 2 == 23, (col 4 - col 3) > 30000 and column 8 >= 15
# (size > 30kb, nprobes >= 15) and writes
#   X, start-1, end, chrX:start-end, column 9, sample id
# where the sample id is column 1 with the "<dataset>." prefix removed.
# Input and output are handled as tab-separated in a single awk pass, so an
# empty or space-containing field cannot shift the sample column.
# Globals:   DATASETS (read)
# Arguments: $1 - output BED file (truncated first)
# Outputs:   one "<NOR|SWE>:<dataset>" line per cohort on stdout
# Returns:   0 on success, 1 if a cohort call file is unreadable (a silently
#            skipped cohort would bias the case/control comparison)
#######################################
chrx_collect_raw_calls() {
  local out_bed=$1
  local entry country ds calls

  : > "${out_bed}" || { chrx_fail "cannot write ${out_bed}"; return 1; }
  for entry in "${DATASETS[@]}"; do

    # get nor_or_swe, dataset name
    country=${entry%%:*}
    ds=${entry#*:}
    echo "${entry}"

    calls="results/qual_cnv/${country}/${ds}/${ds}.quantisnp.tsv"
    [[ -r "${calls}" ]] || { chrx_fail "cannot read ${calls}"; return 1; }

    # pull down raw calls where size >30kb, nprobes >= 15
    sed "s/^${ds}\.//g" "${calls}" \
      | awk -F"\t" 'BEGIN {OFS="\t"}
          ($2==23) && (($4-$3) > 30000) && ($8 >= 15) {
            print "X", $3-1, $4, "chrX:" $3 "-" $4, $9, $1
          }' \
      >> "${out_bed}" \
      || { chrx_fail "could not extract calls from ${calls}"; return 1; }

  done
}

#######################################
# Subset the raw callset to one sex and apply the CNV filters.
# Globals:   CHRX_OUT_DIR, CHRX_PHENO (read)
# Arguments: $1 - "male" or "female" (matched as "_<sex>" in the pheno table)
#            $2 - raw chrX callset (BED, sample id in column 6)
#            $3 - chrX slice of the gnomAD SV sites VCF
# Outputs:   NORDiC_2022.chrX_callset.raw.<sex>.bed and .filtered.<sex>.bed
# Returns:   0 on success, 1 on failure
#######################################
chrx_filter_calls() {
  local sex=$1 raw_bed=$2 x_vcf=$3
  local sex_bed="${CHRX_OUT_DIR}/NORDiC_2022.chrX_callset.raw.${sex}.bed"
  local filtered_bed="${CHRX_OUT_DIR}/NORDiC_2022.chrX_callset.filtered.${sex}.bed"
  local excl_dir=results/cnv_filtering
  local afs="AF,AFR_AF,AFR_NONREF_FREQ,AMR_AF,AMR_NONREF_FREQ,CSA_AF,CSA_NONREF_FREQ,EAS_AF,EAS_NONREF_FREQ,EUR_AF,EUR_NONREF_FREQ,MID_AF,MID_NONREF_FREQ,OCN_AF,OCN_NONREF_FREQ,OTH_AF,OTH_NONREF_FREQ,SAS_AF,SAS_NONREF_FREQ"
  local n_samples

  [[ -r "${CHRX_PHENO}" ]] || { chrx_fail "cannot read ${CHRX_PHENO}"; return 1; }

  # get number of samples (grep -c exits 1 for a count of 0, which is not an
  # error here, so its status is deliberately not checked)
  n_samples=$(grep -c "_${sex}" "${CHRX_PHENO}")

  # get single-sex raw bed: sample ids come from stdin (pheno rows of this
  # sex), then BED rows whose column 6 is one of them are kept; the BED is
  # read tab-separated so empty fields keep their position
  grep "_${sex}" "${CHRX_PHENO}" \
    | awk 'FNR==NR {iids[$1]=1;next}{if ($6 in iids) {print $0}}' \
        - \
        FS='\t' "${raw_bed}" \
    | sort -k2,2n \
    > "${sex_bed}" \
    || { chrx_fail "could not build ${sex_bed}"; return 1; }

  # apply same filters you used in autosomal cnv analysis
  python3 src/cnv_filtering/filter_cnv_bed.py \
    --recipoverlap 0.5 \
    --xcov 0.5 \
    --minsize 30000 \
    --maxsize 20000000 \
    --dist 30000 \
    --chr "X" \
    --maxfreq 0.01 \
    --vcf "${x_vcf}" \
    --vcf-af-fields "${afs}" \
    --allcohorts "${sex_bed}" \
    --allcohorts_nsamp "${n_samples}" \
    -x "${excl_dir}/hg19.centromere_500kb_padding.telomere_500kb.bed" \
    -x "${excl_dir}/human_g1k_v37-N.bed" \
    -x "${excl_dir}/GRCh37GenomicSuperDup.bed" \
    -x "${excl_dir}/repeatmasker_GRCh37.simplerepeat_lowcomplexity_satellite.bed" \
    -x "${excl_dir}/HGNC_Immunoglobulins_TcellReceptors.ENSG.bed" \
    -x "${excl_dir}/NIHMS361907-supplement-Supp_File_S1.hg19.no_chr.bed" \
    "${sex_bed}" \
    "${filtered_bed}" \
    || { chrx_fail "CNV filtering failed (${sex})"; return 1; }
}

#######################################
# Association test between large chrX CNV burden and OCD case status.
# Runs, in order: per-sex PCA, raw call collection, gnomAD chrX slice,
# per-sex call filtering, burden comparison R script. Stops at the first
# failing step so later steps never use missing or stale inputs.
# Globals:   CHRX_OUT_DIR (read)
# Arguments: none
# Returns:   0 on success, 1 as soon as a step fails
#######################################
chrx_analysis() {
  local raw_bed="${CHRX_OUT_DIR}/NORDiC_2022.chrX_callset.raw.bed"
  local x_vcf="${CHRX_OUT_DIR}/gnomad_v2.1_sv.nonneuro.sites.X.vcf"
  local gnomad_vcf_gz=data/cnv_filtering/gnomad_v2.1_sv.nonneuro.sites.vcf.gz
  local sex

  # make sure output dir exists
  mkdir -p "${CHRX_OUT_DIR}/" \
    || { chrx_fail "cannot create ${CHRX_OUT_DIR}"; return 1; }

  # compute PCA for male case/control only (.fam sex code 1)
  chrx_sex_pca male 1 || return 1

  # compute PCA for female case/control only (.fam sex code 2)
  chrx_sex_pca female 2 || return 1

  # get suitable large chrX CNV calls
  chrx_collect_raw_calls "${raw_bed}" || return 1

  # get small slice of gnomAD freqs (header lines and chrX records);
  # pipefail is enabled in a subshell only, so a failing zcat is detected
  # without changing shell options for the rest of the script
  (
    set -o pipefail
    zcat "${gnomad_vcf_gz}" | grep -E "^#|^X" > "${x_vcf}"
  ) || { chrx_fail "could not extract chrX records from ${gnomad_vcf_gz}"; return 1; }

  # get subset of calls from samples in final case/control, split into
  # male-only and female-only
  for sex in "male" "female"; do
    chrx_filter_calls "${sex}" "${raw_bed}" "${x_vcf}" || return 1
  done

  # run burden comparison R script
  Rscript src/additional_analyses/chrX_analysis.R \
    results/global_cnv_burden_analysis/NORDiC_2022.global_cnv_burden_analysis.metrics.tsv \
    "${CHRX_OUT_DIR}/NORDiC_case_control.male_only.PCA.eigenvec" \
    "${CHRX_OUT_DIR}/NORDiC_case_control.female_only.PCA.eigenvec" \
    "${CHRX_OUT_DIR}/NORDiC_2022.chrX_callset.filtered.male.bed" \
    "${CHRX_OUT_DIR}/NORDiC_2022.chrX_callset.filtered.female.bed" \
    "${CHRX_OUT_DIR}/NORDiC_2022.chrX_analysis.res.tsv" \
    || { chrx_fail "burden comparison R script failed"; return 1; }
}

# set to anything other than 1 to skip this analysis
CHRX_ANALYSIS=1
if [[ $CHRX_ANALYSIS == 1 ]]
then
  chrx_analysis
fi

# end of job; a bare `exit` returns the status of the last command executed
exit
