#!/bin/bash
#SBATCH -A sens2018605
#SBATCH -p core
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -t 03:00:00
#SBATCH --mem=4g
#SBATCH -J 02.intensity_qc
#SBATCH --output=logs/intensity_qc/intensity_qc.%A.out
#SBATCH --error=logs/intensity_qc/intensity_qc.%A.err

## STRICT MODE
# Stop at the first failing command, and treat a failure in any stage of a
# pipeline as a failure of the whole pipeline. Every step below consumes files
# written by an earlier step, so continuing after an error would silently
# produce results from missing or stale intermediates.
# `-u` is intentionally not enabled.
# NOTE(review): confirm the site's `module` shell function is nounset-safe
# before adding it.
set -eo pipefail

## MODULE
module load R/4.0.4 R_packages/4.0.4
module load bioinfo-tools
module load plink/1.90b4.9

# make sure output dir exists
# NOTE(review): the log directory named in the #SBATCH --output/--error lines
# (logs/intensity_qc/) has to exist before the job is submitted; creating it
# here would be too late because SLURM opens the log files before this script
# starts.
mkdir -p results/intensity_qc/

# Restrict the intensity/CNV metrics table to samples that passed the
# biological sample QC.
#   pass 1 (.fam file)     : remember every value seen in column 2 (IID)
#   pass 2 (metrics table) : print the row whose column 3 is the literal "IID"
#                            (the header) and every row whose column 3 was
#                            remembered in pass 1
awk '
  FNR == NR { passed[$2] = $1; next }
  $3 == "IID" || ($3 in passed)
' \
  ../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex_rel_EUR_nonoutlier_het.fam \
  results/qual_cnv/NORDiC_2021.penncnv_intensity_metrics.with_rawcnv.tsv \
  > results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc.tsv

# analyze distributions of QC metrics per dataset
# (first argument: the subset table written above; the second argument is
#  presumably an output prefix - confirm against the R script)
Rscript src/intensity_qc/intensity_data_plotting.R \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc.tsv \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc
 
# Intensity-metric outlier pruning on the biological-QC-passing table.
# Writes an outlier-count table and a list of IIDs (the list is used further
# down as the set of passing samples).
# NOTE(review): the original comment said "across the full data", but the call
# passes --by-dataset; confirm which of the two the R script actually does.
intensity_prune_args=(
  --by-dataset
  --n-sd-thresh 3
  --outlier-param-cols LRR_SD,BAF_DRIFT,abs_WF
  --outlier-metric-count-tsv results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.outlier_metric_count.tsv
  # positional: input metrics table, then output IID list
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc.tsv
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.iids.txt
)
Rscript src/intensity_qc/intensity_callset_metrics_outlierpruning.R "${intensity_prune_args[@]}"

# Subset the biological-QC table to the samples listed in the intensity-QC
# IID file.
#   pass 1 (IID list)      : remember column 1
#   pass 2 (metrics table) : keep the header row (column 3 == "IID") and every
#                            row whose column 3 is in the list
awk '
  FNR == NR { keep[$1] = 1; next }
  $3 == "IID" || ($3 in keep)
' \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.iids.txt \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc.tsv \
  > results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.tsv

# replot intensity metrics for the intensity-QC subset
Rscript src/intensity_qc/intensity_data_plotting.R \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.tsv \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc

# Manual inspection of the plots suggests removing samples that meet either of
# the following:
#   1. n_cnv_raw   > 20
#   2. nbp_cnv_raw > 20,000,000
# Those limits are passed to the pruning script through --param-maxes.
cnv_prune_args=(
  --param-maxes "n_cnv_raw=20,nbp_cnv_raw=20000000"
  --outlier-metric-count-tsv results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.outlier_metric_count.tsv
  # positional: input metrics table (intensity-QC subset), then output IID list
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.tsv
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.iids.txt
)
Rscript src/intensity_qc/intensity_callset_metrics_outlierpruning.R "${cnv_prune_args[@]}"

# Subset the biological-QC table to the samples listed in the intensity + CNV
# QC IID file (the list written by the --param-maxes pruning run).
#   pass 1 (IID list)      : remember column 1
#   pass 2 (metrics table) : keep the header row (column 3 == "IID") and every
#                            row whose column 3 is in the list
awk '
  FNR == NR { keep[$1] = 1; next }
  $3 == "IID" || ($3 in keep)
' \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.iids.txt \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc.tsv \
  > results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.tsv

# replot intensity metrics for the intensity + CNV QC subset
Rscript src/intensity_qc/intensity_data_plotting.R \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.tsv \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc

# Build the tab-separated FID/IID keep-list for the pruned sample set.
#   pass 1 (IID list) : remember column 1
#   pass 2 (.fam)     : for rows whose column 2 (IID) is in the list, print
#                       columns 1 and 2 (FID, IID)
awk '
  BEGIN { OFS = "\t" }
  FNR == NR { keep[$1] = 1; next }
  $2 in keep { print $1, $2 }
' \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.iids.txt \
  ../NORDiC_GWAS_CNV_202103/results/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids.fam \
  > results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA.fidiid.tsv

# PCA of intensity pruned samples
pca_args=(
  --allow-no-sex
  --bfile ../NORDiC_GWAS_CNV_202103/results/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids
  # variant list to analyse
  --extract ../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.1000_genomes_phase3.MERGED.vprune.prune.in
  # sample list written by the awk step above
  --keep results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA.fidiid.tsv
  --pca
  --out results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA
)
plink "${pca_args[@]}"

# get ANGI samples
# Skip the header line, then for every row whose column 2 matches /ANGI/ write
# column 3 twice (used as both FID and IID in the removal list).
# NOTE(review): this only removes the intended samples if FID equals IID for
# them in the PLINK fileset - confirm.
tail -n +2 results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.tsv \
  | awk '
      BEGIN { OFS = "\t" }
      $2 ~ /ANGI/ { print $3, $3 }
    ' \
  > results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.ANGI.fidiid.tsv

# PCA of intensity pruned samples (ANGI excluded)
pca_no_angi_args=(
  --allow-no-sex
  --bfile ../NORDiC_GWAS_CNV_202103/results/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids
  --extract ../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.1000_genomes_phase3.MERGED.vprune.prune.in
  # same keep-list as the full-cohort PCA ...
  --keep results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA.fidiid.tsv
  # ... minus the ANGI samples listed above
  --remove results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.ANGI.fidiid.tsv
  --pca
  --out results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.no_ANGI.PCA
)
plink "${pca_no_angi_args[@]}"

# Plot the first 8 PCs of one PCA run.
# Arguments: $1 - .eigenvec file written by plink --pca
#            $2 - second positional argument of plot_PCA.R
#                 (presumably the output plot prefix - confirm)
plot_pcs() {
  Rscript src/intensity_qc/plot_PCA.R \
    --first-n-pcs 8 \
    --fam-file ../NORDiC_GWAS_CNV_202103/results/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids.fam \
    "$1" \
    "$2"
}

# plot PCs (full cohort)
plot_pcs \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA.eigenvec \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.PCA_plot

# plot PCs (no ANGI)
plot_pcs \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.no_ANGI.PCA.eigenvec \
  results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.no_ANGI.PCA_plot

# get dataset/IID tsv
# Takes tab-delimited columns 2 and 3 of the raw metrics table. cut reads the
# file directly (no `cat |`), so a missing or unreadable input shows up in this
# command's exit status instead of being hidden behind a successful cut.
# NOTE(review): the input below is not written by any step in this script -
# confirm which upstream job produces it.
cut -f 2,3 results/intensity_qc/NORDiC_2021.intensity_callset_metrics.raw.tsv \
  > results/intensity_qc/NORDiC_2021.dataset_IID.tsv

# get dataset, iid for samples with miss gt 0.02
# The header and the sorted body are written through one redirect, and printf
# replaces `echo -e`, whose escape handling depends on shell options.
#   pass 1 (dataset/IID table) : ds[IID] = dataset   (column 2 -> column 1)
#   pass 2 (.imiss)            : rows whose IID is known, whose column 6 is
#                                > 0.02, and which are not the header row
# The first output column holds the dataset name looked up from the
# dataset/IID table, although the header labels it FID.
{
  printf 'FID\tIID\tIMISS\n'
  awk '
    BEGIN { OFS = "\t" }
    FNR == NR { ds[$2] = $1; next }
    ($2 in ds) && ($6 > 0.02) && ($2 != "IID") { print ds[$2], $2, $6 }
  ' \
    results/intensity_qc/NORDiC_2021.dataset_IID.tsv \
    ../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.imiss \
    | sort -k1
} > results/intensity_qc/NORDiC_2021.dataset_IID.miss_qcfail.tsv

# get samples with sex qcfail
# The header and the sorted body are written through one redirect, and printf
# replaces `echo -e`, whose escape handling depends on shell options.
#   pass 1 (dataset/IID table) : ds[IID] = dataset   (column 2 -> column 1)
#   pass 2 (.sexcheck)         : rows whose IID is known, whose column 3
#                                (PEDSEX) is not 0, whose column 5 is
#                                "PROBLEM", and which are not the header row
# The first output column holds the dataset name looked up from the
# dataset/IID table, although the header labels it FID.
{
  printf 'FID\tIID\tPEDSEX\tSNPSEX\n'
  awk '
    BEGIN { OFS = "\t" }
    FNR == NR { ds[$2] = $1; next }
    ($2 in ds) && ($3 != 0) && ($5 == "PROBLEM") && ($2 != "IID") { print ds[$2], $2, $3, $4 }
  ' \
    results/intensity_qc/NORDiC_2021.dataset_IID.tsv \
    ../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss.sexcheck \
    | sort -k1
} > results/intensity_qc/NORDiC_2021.dataset_IID.sex_qcfail.tsv

# make csv with number of samples at each qc step
# After the dataset/IID table, each positional argument is a "label:file" pair
# naming one QC stage and the sample list that survived it, in pipeline order.
qc_step_args=(
  --out-csv results/intensity_qc/NORDiC_2021.dataset_n_per_qc_step.csv
  results/intensity_qc/NORDiC_2021.dataset_IID.tsv
  raw:../NORDiC_GWAS_CNV_202103/results/dataset_merge/NORDiC_2021.MERGED.with_phe.nonambig_snv.updated_ids.fam
  missingness:../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.imiss_le_0.02.fidiid.tsv
  sex_concordance:../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex.fam
  relatedness:../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex.relatedness.fids_iids.keep.tsv
  EUR_ancestry:../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex_rel.EUR_classif.fidiid.tsv
  PCA_nonoutlier:../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex_rel_EUR.PCA.outliers.nonoutlier.fidiid.tsv
  heterozygosity:../NORDiC_GWAS_CNV_202103/results/sample_biological_qc/NORDiC_2021.MERGED.pruned_miss_sex_rel_EUR_nonoutlier.heterozygosity_check.nonoutlier.fidiid.tsv
  intensity_qc:results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_qc.iids.txt
  cnv_qc:results/intensity_qc/NORDiC_2021.intensity_callset_metrics.biolqc_intensity_cnv_qc.iids.txt
)
Rscript src/intensity_qc/sample_pruning_steps_csv.R "${qc_step_args[@]}"

# exit with the status of the last command
exit
