#!/bin/bash
#SBATCH -A sens2018605
#SBATCH -p core
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -t 06:00:00
#SBATCH --mem=8g
#SBATCH -J 05.assoc_tests
#SBATCH --output=logs/assoc_tests/assoc_tests.%A.out
#SBATCH --error=logs/assoc_tests/assoc_tests.%A.err

# association tests conducted with the purpose of
# 1) ensuring that summary statistics are well-controlled (p-values not inflated)
# 2) identifying genes/loci associated with OCD case status

## MODULE
module load python/3.8.7
module load R/4.0.4 R_packages/4.0.4
module load bioinfo-tools
module load BEDTools/2.25.0

# Read the pipeline configuration. PY3_MODULES_DIR, used just below, is not
# assigned anywhere in this script, so it is presumably defined in this file.
source cfg/cnv_calling.cfg

# Extend the python module search path with the project's python sources and
# the extra module directory, then export it for the python3 steps below.
PYTHONPATH="${PYTHONPATH}:${PWD}/src/python/:${PY3_MODULES_DIR}"
export PYTHONPATH

# Output directory for the files written by this script.
mkdir -p results/assoc_tests/

# Build the IID / group / phenotype tables from the analysis-ready .fam file.
# Every output row is tab-separated: fam column 2, a group label, and fam
# column 6 minus 1.
fam_file=results/cnv_filtering/NORDiC_2021.analysisready.fam
pheno_prefix=results/assoc_tests/NORDiC_2022.iid_group_pheno

# all samples pooled into one case/control group
awk -v OFS="\t" '{print $2, "NORDiC_casectrl", $6-1}' "$fam_file" \
> "${pheno_prefix}.not_sex_stratified.tsv"

# two groups keyed on fam column 5: rows with value 1 get the male label and
# are written first, rows with value 2 get the female label and follow
{
  awk -v OFS="\t" '$5==1 {print $2, "NORDiC_casectrl_male", $6-1}' "$fam_file"
  awk -v OFS="\t" '$5==2 {print $2, "NORDiC_casectrl_female", $6-1}' "$fam_file"
} > "${pheno_prefix}.sex_stratified.tsv"

# get norwegian and swedish iids
#
# Any row of the dataset/IID table that contains "-NOR_" or "norway" anywhere
# on the line is treated as Norwegian; every other row is treated as Swedish.
# Column 2 of the selected rows is written to the per-country list.
#
# grep -E is used instead of the obsolescent egrep, and the pattern is passed
# through -e so that its leading "-" is never parsed as an option (which also
# removes the need for the non-portable "\-" escape).
dataset_iid_tsv=results/intensity_qc/NORDiC_2021.dataset_IID.tsv
norway_pattern='-NOR_|norway'

grep -E -e "$norway_pattern" "$dataset_iid_tsv" \
| cut -f 2 \
> results/assoc_tests/NORDiC_2022.norway.iid.list

grep -E -v -e "$norway_pattern" "$dataset_iid_tsv" \
| cut -f 2 \
> results/assoc_tests/NORDiC_2022.sweden.iid.list

# Build the country- and sex-stratified IID / group / phenotype table.
# Rows are appended country by country (norway, then sweden); within a
# country, fam rows with column 5 == 1 (male label) come before fam rows with
# column 5 == 2 (female label).
strat_fam=results/cnv_filtering/NORDiC_2021.analysisready.fam
strat_tsv=results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv

# start from an empty file; every pass below appends to it
: > "$strat_tsv"
for grp in norway sweden; do
  for sex_spec in 1:male 2:female; do

    # first input: remember the IIDs of this country
    # second input: print the fam rows of those IIDs with the wanted sex code
    awk -v OFS="\t" \
        -v GRP="$grp" \
        -v SEXCODE="${sex_spec%%:*}" \
        -v SEX="${sex_spec#*:}" '
      FNR==NR {iids[$1]=1; next}
      ($2 in iids) && ($5==SEXCODE) {print $2, "NORDiC-" GRP "_casectrl_" SEX, $6-1}
    ' \
    "results/assoc_tests/NORDiC_2022.${grp}.iid.list" \
    "$strat_fam" \
    >> "$strat_tsv"

  done
done

# make table of sample counts per case/control group
#
# Input : the country/sex-stratified table (column 2 = group label,
#         column 3 = phenotype code).
# Output: a header line plus one row per group holding the number of rows
#         with phenotype code 1 (n_cases) and 0 (n_controls).
infile=results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv
outfile=results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.nca_nco_tbl.tsv

#######################################
# Count the rows of a group/phenotype table that belong to one group and
# carry one phenotype code.
# "N+0" forces awk to print 0 when no row matches; a bare "print N" on a
# never-incremented variable prints an empty string and would leave a blank
# cell in the table.
# Arguments: $1 - group label to match against column 2
#            $2 - phenotype code to match against column 3
#            $3 - path to the table
# Outputs:   the count on stdout
#######################################
count_group_pheno() {
  awk -v GRP="$1" -v PHENO="$2" \
    '($2==GRP) && ($3==PHENO) {N++} END {print N+0}' "$3"
}

printf 'group\tn_cases\tn_controls\n' \
> "$outfile"
grps=(NORDiC-sweden_casectrl_male
      NORDiC-sweden_casectrl_female
      NORDiC-norway_casectrl_male
      NORDiC-norway_casectrl_female)
for grp in "${grps[@]}"
do
  nca=$(count_group_pheno "$grp" 1 "$infile")
  nco=$(count_group_pheno "$grp" 0 "$infile")
  printf '%s\t%s\t%s\n' "$grp" "$nca" "$nco" >> "$outfile"
done

NULL_TESTS=1
if [[ $NULL_TESTS == 1 ]]
then

  # null association tests (per array group, merge all non-CDS CNVs and then
  # cluster and perform locus-based tests on these loci). Resulting sumstats
  # should have low to zero genomic inflation.
  #
  # Steps in this section:
  #   1. per size bin: select CNVs, cluster them into loci, test the loci
  #   2. derive the loci to exclude from the 30-100 kb results
  #   3. build the analysis-ready callset without small CNVs at those loci
  #   4. annotate the analysis-ready callset with CDS overlaps
  #   5. rerun the locus tests for two size bins on the analysis-ready callset
  grp="full"
  for x in "0_1"
  do

    # do null tests on size bins; size_mins[i] / size_maxes[i] define bin i
    size_mins=(30000 30000 100000 100000)
    size_maxes=(20000000 100000 200000 500000)
    for i in "${!size_mins[@]}"
    do

      # define min and max cnv size
      sizemin=${size_mins[$i]}
      sizemax=${size_maxes[$i]}

      # get set of cnvs that fall in size bin
      # (size = column 3 - column 2, both bounds inclusive)
      awk -v SIZEMIN="${sizemin}" -v SIZEMAX="${sizemax}" \
      '{size=$3-$2; if ((SIZEMIN <= size) && (size <= SIZEMAX)) {print $0}}' \
      "results/annot_cnv/NORDiC_2021.callset.CDS_${x}.bed" \
      > "results/assoc_tests/NORDiC_2022.callset.CDS_${x}.size_${sizemin}_${sizemax}.bed"

      # form contiguous CNV call interval clusters
      bedtools cluster \
      -i "results/assoc_tests/NORDiC_2022.callset.CDS_${x}.size_${sizemin}_${sizemax}.bed" \
      | python3 src/assoc_tests/bedtools_cluster_to_merged_intervals.py stdin \
      > "results/assoc_tests/NORDiC_2022.callset.CDS_${x}.size_${sizemin}_${sizemax}.clusters.bed"

      # run assoc tests on CNV clusters
      python3 src/assoc_tests/cnv_assoc_tests.py \
      --sampleinfo-group-grep "${grp}" \
      --minsize "${sizemin}" \
      --maxsize "${sizemax}" \
      --use-cmh-stats \
      --cnv-qq-plot-pdf "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.cnv.qq.pdf" \
      --del-qq-plot-pdf "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.del.qq.pdf" \
      --dup-qq-plot-pdf "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.dup.qq.pdf" \
      --random-seed-number 11591 --n-perm 1000 \
      --chr 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 \
      --gene-bed "results/assoc_tests/NORDiC_2022.callset.CDS_${x}.size_${sizemin}_${sizemax}.clusters.bed" \
      --cnv-types DEL,DUP \
      --matplotlib-use Agg \
      results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv \
      "results/annot_cnv/NORDiC_2021.callset.CDS_${x}.bed" \
      "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.res.tsv"

    done

  done

  # lambda inflation for small CNVs, being driven by 3 clusters of small CNVs.
  # get cluster intervals and exclude small CNVs overlapping these loci out of analysis.
  # most likely are batch effects.

  # get names of clusters to exclude: column 1 of every result row whose
  # column 10 is below 0.05, with the _DEL / _DUP suffix removed
  # NOTE(review): column 10 is presumably a p-value column of the results
  # table - confirm against cnv_assoc_tests.py output
  awk '{if ($10<0.05) {print $1}}' \
    results/assoc_tests/NORDiC_2022.full.CDS_0_1.size_30000_100000.res.tsv \
  | sed 's/_DEL//g; s/_DUP//g' \
  > results/assoc_tests/NORDiC_2022.full.CDS_0_1.size_30000_100000.loci_exclude.list

  # get bed intervals to go along with loci (cluster name is matched against
  # column 4 of the clusters bed)
  awk 'FNR==NR{loci[$1]=1;next}{if ($4 in loci) {print $0}}' \
    results/assoc_tests/NORDiC_2022.full.CDS_0_1.size_30000_100000.loci_exclude.list \
    results/assoc_tests/NORDiC_2022.callset.CDS_0_1.size_30000_100000.clusters.bed \
  > results/assoc_tests/NORDiC_2022.full.CDS_0_1.size_30000_100000.loci_exclude.bed

  # create bed file of small cnvs not overlapping these loci
  awk -v SIZEMIN=30000 -v SIZEMAX=100000 \
    '{size=$3-$2; if ((SIZEMIN <= size) && (size <= SIZEMAX)) {print $0}}' \
    results/annot_cnv/NORDiC_2021.callset.bed \
  | bedtools intersect -v \
    -a stdin \
    -b results/assoc_tests/NORDiC_2022.full.CDS_0_1.size_30000_100000.loci_exclude.bed \
  > results/assoc_tests/NORDiC_2021.callset.size_30000_100000.fail_loci_rm.bed

  # append to all cnvs at least 100kb in size to form analysis-ready callset
  # A CNV of exactly 100000 bp passes both the 30000-100000 filter above and
  # the >= 100000 filter here, so it would be listed twice; after sorting,
  # identical lines are adjacent and uniq keeps a single copy.
  awk -v SIZEMIN=100000 \
    '{size=$3-$2; if (SIZEMIN <= size) {print $0}}' \
    results/annot_cnv/NORDiC_2021.callset.bed \
  | cat \
    - \
    results/assoc_tests/NORDiC_2021.callset.size_30000_100000.fail_loci_rm.bed \
  | sort -k1,1 -k2,2n \
  | uniq \
  > results/assoc_tests/NORDiC_2021.callset.analysisready.bed

  # get CDS overlaps (columns 1-6 and 10 of the joined records, de-duplicated)
  bedtools intersect -wa -wb \
    -a results/assoc_tests/NORDiC_2021.callset.analysisready.bed \
    -b results/annot_cnv/Homo_sapiens.GRCh37.87.CDS.bed \
  | cut -f 1-6,10 \
  | uniq \
  | sort -k1,1 -k2,2n \
  | uniq \
  > results/assoc_tests/NORDiC_2021.callset.analysisready.CDS_gene_overlaps.bed

  # redo qq plot with fail loci excluded
  # The bin arrays are redefined with two entries here, so the loop is driven
  # by the indices of the new array. Reusing the upper index computed for the
  # four-bin list above would add two passes with empty sizemin/sizemax.
  # x is set explicitly instead of relying on the value left over from the
  # loop above; the file names below use the same "0_1" label.
  x="0_1"
  size_mins=(30000 30000)
  size_maxes=(20000000 100000)
  for i in "${!size_mins[@]}"
  do

    # define min and max cnv size
    sizemin=${size_mins[$i]}
    sizemax=${size_maxes[$i]}

    # run assoc tests on CNV clusters (cluster intervals come from the first
    # pass above; CNVs come from the analysis-ready callset)
    python3 src/assoc_tests/cnv_assoc_tests.py \
    --sampleinfo-group-grep "${grp}" \
    --minsize "${sizemin}" \
    --maxsize "${sizemax}" \
    --use-cmh-stats \
    --cnv-qq-plot-pdf "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.analysisready.cnv.qq.pdf" \
    --del-qq-plot-pdf "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.analysisready.del.qq.pdf" \
    --dup-qq-plot-pdf "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.analysisready.dup.qq.pdf" \
    --random-seed-number 11591 --n-perm 1000 \
    --chr 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 \
    --gene-bed "results/assoc_tests/NORDiC_2022.callset.CDS_${x}.size_${sizemin}_${sizemax}.clusters.bed" \
    --cnv-types DEL,DUP \
    --matplotlib-use Agg \
    results/assoc_tests/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv \
    results/assoc_tests/NORDiC_2021.callset.analysisready.bed \
    "results/assoc_tests/NORDiC_2022.${grp}.CDS_${x}.size_${sizemin}_${sizemax}.analysisready.res.tsv"

  done

fi

GENE_TESTS_ALL=1
if [[ $GENE_TESTS_ALL == 1 ]]; then

  # Gene-based association tests on the analysis-ready callset.
  # The analysis-ready callset and its CDS gene overlaps are read from
  # results/assoc_tests/, so they must already exist when this section runs.
  grp="full"
  assoc_dir=results/assoc_tests
  pheno_tsv=${assoc_dir}/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv
  callset_bed=${assoc_dir}/NORDiC_2021.callset.analysisready.bed
  cds_bed=results/annot_cnv/Homo_sapiens.GRCh37.87.CDS.bed
  cds_clumped_bed=${assoc_dir}/Homo_sapiens.GRCh37.87.CDS.clumped.full.bed
  cds_clumped_overlaps_bed=${assoc_dir}/NORDiC_2021.callset.CDS_gene_overlaps.clumped.bed
  gene_prefix=${assoc_dir}/NORDiC_2022.${grp}.gene

  # Merge neighbouring gene units (all samples).
  # NOTE(review): this step has been described as clumping units with 100%
  # overlap, but the threshold passed is 0.5 - confirm which is intended.
  python3 src/assoc_tests/merge_neighboring_geneunits.py \
    --clump-sample-overlap-fraction-min 0.5 \
    "$cds_bed" \
    "${assoc_dir}/NORDiC_2021.callset.analysisready.CDS_gene_overlaps.bed" \
    > "$cds_clumped_bed"

  # Locus-based association tests with the clumped gene units as loci.
  gene_assoc_args=(
    --sampleinfo-group-grep "${grp}"
    --minsize 30000
    --use-cmh-stats
    --cnv-qq-plot-pdf "${gene_prefix}.cnv.qq.pdf"
    --del-qq-plot-pdf "${gene_prefix}.del.qq.pdf"
    --dup-qq-plot-pdf "${gene_prefix}.dup.qq.pdf"
    --random-seed-number 11591 --n-perm 1000
    --chr 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
    --gene-bed "$cds_clumped_bed"
    --cnv-types DEL,DUP
    --matplotlib-use Agg
    "$pheno_tsv"
    "$callset_bed"
    "${gene_prefix}.res.tsv"
  )
  python3 src/assoc_tests/cnv_assoc_tests.py "${gene_assoc_args[@]}"

  # Add an FDR-adjusted copy of the cmh_p column as cmh_p_fdr.
  Rscript src/assoc_tests/add_padj_col.R \
    "${gene_prefix}.res.tsv" \
    cmh_p fdr cmh_p_fdr \
    "${gene_prefix}.res.fdr.tsv"

  # For the sumstats assessment, join each call to the clumped CDS units it
  # overlaps and keep columns 1-6 and 10 of the joined record, de-duplicated.
  bedtools intersect -wa -wb \
    -a "$callset_bed" \
    -b "$cds_clumped_bed" \
  | cut -f 1-6,10 \
  | sort -k1,1 -k2,2 -k7,7 \
  | uniq \
  > "$cds_clumped_overlaps_bed"

  # Permutation test for an excess burden in cases relative to controls for
  # 1. singleton test statistics (ex: gene X has 1 carrier and it is a case)
  # 2. non-singleton (n_carrier > 1) cnv frequency
  # Run once for deletions (--dels-only) and once for duplications
  # (--dups-only); stdout and stderr of each run are kept in .out / .err files.
  for x in del dup; do

    excess_prefix=${assoc_dir}/NORDiC_2022.${grp}.sumstats_case_excess.${x}
    python3 src/assoc_tests/cnv_recurrence_permutation.py \
      "--${x}s-only" \
      --n-permutations 10000 \
      --seed-number 11591 \
      "$pheno_tsv" \
      "$cds_clumped_overlaps_bed" \
      "$excess_prefix" \
      > "${excess_prefix}.out" \
      2> "${excess_prefix}.err"

  done

fi

BREAKPOINT_TESTS=1
if [[ $BREAKPOINT_TESTS == 1 ]]; then

  # Breakpoint-level association tests: probe sites take the place of genes
  # as the units that CNV calls are overlapped with and clumped on.
  grp="full"
  bp_dir=results/assoc_tests
  bp_callset_bed=${bp_dir}/NORDiC_2021.callset.analysisready.bed
  probe_bed=${bp_dir}/NORDiC_2022.probe_sites.bed
  probe_overlaps_bed=${bp_dir}/NORDiC_2022.callset.probe_sites_overlaps.bed
  probe_clumped_bed=${bp_dir}/NORDiC_2022.probe_sites.clumped.bed
  bp_prefix=${bp_dir}/NORDiC_2022.${grp}.breakpoint

  # Turn the .bim file into a sorted bed of probe sites: column 1, column 4
  # minus 1, column 4, column 2 (one 1-bp interval per bim row).
  bim=../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex_rel.bim
  awk -v OFS="\t" '{print $1, $4-1, $4, $2}' "$bim" \
  | sort -k1,1 -k2,2n \
  > "$probe_bed"

  # Join each call to the probe sites it overlaps, keeping columns 1-6 and 10
  # of the joined record and dropping adjacent duplicate lines.
  bedtools intersect -wa -wb \
    -a "$bp_callset_bed" \
    -b "$probe_bed" \
  | cut -f 1-6,10 \
  | uniq \
  > "$probe_overlaps_bed"

  # Clump probe sites with the same script and threshold used for the gene
  # overlap clumps.
  python3 src/assoc_tests/merge_neighboring_geneunits.py \
    --clump-sample-overlap-fraction-min 0.5 \
    "$probe_bed" \
    "$probe_overlaps_bed" \
    > "$probe_clumped_bed"

  # Locus-based association tests with the probe clumps as loci.
  bp_assoc_args=(
    --sampleinfo-group-grep "${grp}"
    --use-cmh-stats
    --minsize 30000
    --del-qq-plot-pdf "${bp_prefix}.del.qq.pdf"
    --dup-qq-plot-pdf "${bp_prefix}.dup.qq.pdf"
    --random-seed-number 11591 --n-perm 1000
    --chr 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
    --cnv-types DEL,DUP
    --matplotlib-use Agg
    --gene-bed "$probe_clumped_bed"
    "${bp_dir}/NORDiC_2022.iid_group_pheno.norway_sweden.sex_stratified.tsv"
    "$bp_callset_bed"
    "${bp_prefix}.res.tsv"
  )
  python3 src/assoc_tests/cnv_assoc_tests.py "${bp_assoc_args[@]}"

  # Add an FDR-adjusted copy of the cmh_p column as cmh_p_fdr.
  Rscript src/assoc_tests/add_padj_col.R \
    "${bp_prefix}.res.tsv" \
    cmh_p fdr cmh_p_fdr \
    "${bp_prefix}.res.fdr.tsv"

fi

exit
