####################################################################################################################################
####################################################################################################################################
#Using BUSCO to evaluate genomes of Pst and select the appropriate reference genome
####################################################################################################################################
####################################################################################################################################

#BUSCO code

run_BUSCO.py -i 123.fasta -l basidiomycota_odb10 -o 123 -m genome --cpu 10
#-i: input assembled genome (FASTA); -l: lineage dataset, here basidiomycota ("odb10" = OrthoDB v10; the original "obd10" was a typo)
#-o: name of the output run; -m: assessment mode (genome); --cpu: number of CPU cores used (10)
#NOTE(review): if the lineage is a local directory, make sure its name matches the -l value


####################################################################################################################################
####################################################################################################################################
#Detecting variants of Pst isolates in China with GATK4 pipeline
####################################################################################################################################
####################################################################################################################################

#Public variables used in pipeline
####################################################################################################################################
#command-line arguments: <sample id> <sample name> <read file 1 (.gz)> <read file 2 (.gz)>
i="${1}"        #prefix used for every per-sample output file
location="${2}" #sample name, written to the read group (ID and SM) of the alignment
file1="${3}"    #gzipped reads, mate 1
file2="${4}"    #gzipped reads, mate 2

#paths of the programs used by the pipeline
gatk=/users/huxp/yuxiang.li/005_GTAK/gatk-4.1.9.0/gatk #GATK4: variant calling and base quality recalibration
picard=/users/huxp/biosoft/picard.jar #Picard: cleaning, sorting and duplicate removal of BAM/SAM files
bwa=/users/huxp/biosoft/miniconda3/envs/base-analysis/bin/bwa #BWA: mapping of the paired reads to the reference genome
samtools=/users/huxp/biosoft/miniconda3/envs/base-analysis/bin/samtools #SAMtools: conversion and indexing of BAM/SAM files
vcftools=/users/huxp/biosoft/miniconda3/envs/base-analysis/bin/vcftools #VCFtools: filtering of VCF files

#reference genome (FASTA) and its indexes
Genome=/users/huxp/data/DK0911_primer.fasta
#prefix of the BWA index
#NOTE(review): this path lacks the "huxp" component used by the other reference paths - confirm it is correct
Genome_bwa=/users/data/DK0911_primer.fasta
#samtools (faidx) index of the reference
sample_index=/users/huxp/data/DK0911_primer.fasta.fai

#directory for the temporary, decompressed read files
tmpdir=/users/huxp/data/tmp/

#code of BWA and GATK
####################################################################################################################################

#creating the working directory
workdir=/users/huxp/jichen.dai/workdir
#-p: also create missing parent directories and do not fail when the directory already exists (safe re-runs)
mkdir -p "$workdir/test_gatk_pipeline/${location}"
#NOTE(review): the following steps write to $workdir itself, not to the per-sample directory created here - confirm this is intended

#unzip: decompress both read files into the temporary directory
#the input paths come from the command line, so they are quoted to survive spaces and glob characters
gunzip -c "$file1" > "$tmpdir/${i}_1_clean.fq"
gunzip -c "$file2" > "$tmpdir/${i}_2_clean.fq"

#BWA-MEM: map the paired reads to the reference genome and write the alignment as SAM
$bwa mem \
  -t 20 \
  -M \
  -r 1.0 \
  -R '@RG\tID:'${location}'\tSM:'${location}'\tPL:ILLUMINA\tDS:djc' \
  $Genome_bwa \
  $tmpdir/"$i"_1_clean.fq \
  $tmpdir/"$i"_2_clean.fq \
  > $workdir/"$i".sam
#-t 20: use 20 threads; -M: mark shorter split hits as secondary
#-r 1.0: look for internal seeds inside a seed longer than {-k} * 1.0
#-R: read group header line (ID and SM are set to the sample name, PL to ILLUMINA)

#SAM -> BAM: convert the alignment to (uncompressed) BAM, keeping the header
$samtools view \
  -@ 20 \
  -u -b -h -S \
  -t $sample_index \
  $workdir/"$i".sam \
  > $workdir/"$i".bam
#-@ 20: twenty additional threads; -t: reference index (.fai) listing the sequence names and lengths

#statistic: compute the mapping rate of the sample
#the input is the BAM written by the previous step; the original read the literal file "i".bam, which never exists
$samtools flagstat -@ 8 "$workdir/$i.bam" > "$i".statistic
#-@ 8: eight additional threads; the report is written to <sample id>.statistic in the current directory

#Picard post-processing of the alignment: clean -> coordinate sort -> remove duplicates
#step 1: CleanSam
java -Xmx32G -jar $picard CleanSam \
  INPUT=$workdir/"$i".bam \
  OUTPUT=$workdir/"$i".clean.bam \
  VALIDATION_STRINGENCY=SILENT

#step 2: SortSam, sorting by genomic coordinate
java -Xmx32G -jar $picard SortSam \
  INPUT=$workdir/"$i".clean.bam \
  OUTPUT=$workdir/"$i".clean.sorted.bam \
  SORT_ORDER=coordinate \
  VALIDATION_STRINGENCY=LENIENT

#step 3: MarkDuplicates; REMOVE_DUPLICATES=TRUE drops the duplicates from the output instead of only flagging them
java -Xmx32G -jar $picard MarkDuplicates \
  INPUT=$workdir/"$i".clean.sorted.bam \
  OUTPUT=$workdir/"$i".final.bam \
  METRICS_FILE=$workdir/DUP_METRICS."$i".OUT \
  REMOVE_DUPLICATES=TRUE \
  MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=800 \
  VALIDATION_STRINGENCY=LENIENT

#index bam file
#"samtools index" writes the BAM index (.bai) that GATK needs; "samtools faidx" only indexes FASTA files
$samtools index $workdir/"$i".final.bam

#first round of variant calling; its output is filtered below into the known-sites set used for recalibration
$gatk --java-options "-Xmx48g" HaplotypeCaller \
  -R $Genome \
  -I $workdir/"$i".final.bam \
  -O $workdir/"$i".snps.indels.vcf \
  --native-pair-hmm-threads 16
#-R: reference genome; -I: input BAM file; -O: output VCF file
#--native-pair-hmm-threads: the native pairHMM implementation uses 16 threads

#VCFtools: select the "gold" (extremely high-confidence) variants of the first calling round
$vcftools \
  --vcf $workdir/"$i".snps.indels.vcf \
  --min-alleles 2 \
  --max-alleles 2 \
  --minQ 1000 \
  --recode \
  --recode-INFO-all \
  --out $workdir/"$i".snps.indels.gold
#--min-alleles 2 together with --max-alleles 2: keep only biallelic sites
#--minQ 1000: keep only sites with a quality value above 1000
#--recode: write a new VCF file containing the sites that pass the filters
#--recode-INFO-all: keep all INFO fields in the recoded file
#--out: prefix of the output file (the result is <prefix>.recode.vcf)

#build the index of the gold VCF so that it can be passed to BaseRecalibrator as --known-sites
$gatk --java-options "-Xmx48g" IndexFeatureFile -I $workdir/"$i".snps.indels.gold.recode.vcf
#-I: the VCF file to index

#base quality score recalibration (BQSR) of the BAM file, in two steps
#step 1: BaseRecalibrator builds the recalibration table, using the gold variants as known sites
$gatk --java-options "-Xmx48g" BaseRecalibrator \
  -I $workdir/"$i".final.bam \
  -R $Genome \
  --known-sites $workdir/"$i".snps.indels.gold.recode.vcf \
  -O $workdir/"$i".recal_data.table

#step 2: ApplyBQSR applies the table and writes the recalibrated BAM
$gatk --java-options "-Xmx48g" ApplyBQSR \
  -I $workdir/"$i".final.bam \
  -R $Genome \
  --bqsr-recal-file $workdir/"$i".recal_data.table \
  -O $workdir/"$i".recal.bam

#-R: reference genome; -I: input BAM file; -O: output file
#--known-sites: database(s) of known polymorphic sites; regions around them are excluded from the analysis
#--bqsr-recal-file: recalibration table produced by BaseRecalibrator (required)

#final round of variant calling on the recalibrated BAM, written as a per-sample GVCF
$gatk --java-options "-Xmx48g" HaplotypeCaller \
  -R $Genome \
  -I $workdir/"$i".recal.bam \
  -O $workdir/"$i".g.vcf \
  -ERC GVCF \
  --native-pair-hmm-threads 16

#-R: reference genome; -I: input BAM file; -O: output file
#-ERC (--emit-ref-confidence) GVCF: emit reference confidence scores in GVCF mode
#--native-pair-hmm-threads: the native pairHMM implementation uses 16 threads

###############################Done_with_snp_calling##################################

#Hard-filter every per-sample GVCF file
output_path="/users/huxp/jichen.dai/workdir/"

#loop over the glob directly: the original stored the single-quoted string 'ls ...' (no command substitution),
#so the loop also received the word "ls" as if it were a file
for file in /users/huxp/jichen.dai/workdir/*.g.vcf
do
  #an unmatched glob is left as the literal pattern; skip it
  [ -e "$file" ] || continue
  #skip the outputs of an earlier run, which also match *.g.vcf, so they are not filtered twice
  case "$file" in
    *_filter.g.vcf) continue ;;
  esac
  echo "$file"
  #sample id = file name up to the first dot
  id=$(basename "$file")
  id=${id%%.*}
  echo $id
  ~/yuxiang.li/005_GTAK/gatk-4.1.9.0/gatk VariantFiltration -V "$file" --filter-expression "QD<2.0||FS>60.0||MQ<40.0||MQRankSum<-12.5||ReadPosRankSum<-8.0||SOR>3.0" --filter-name "Filtered" -O $output_path/"$id"_filter.g.vcf -R /users/huxp/jichen.dai/DK0911_primer.fasta
  #-V: input variant file, --filter-expression: the filter standard used in this study, QD: quality by depth; FS: FisherStrand; MQ: mapping quality
  #MQRankSum: mapping quality rank sum, ReadPosRankSum: rank sum test for relative positioning of REF versus ALT alleles within reads, SOR: strand odds ratio
  #--filter-name: name written to the FILTER field of records matching the expression
  #-O: output file, -R: reference genome
done

echo "all file done"

#Merge the filtered per-sample GVCF files into one multi-sample GVCF
#vcf.list: one GVCF path per line, passed to -V as a list file
ls ~/jichen.dai/workdir/*_filter.g.vcf > vcf.list
~/yuxiang.li/005_GTAK/gatk-4.1.9.0/gatk CombineGVCFs \
  -V vcf.list \
  -R ~/jichen.dai/DK0911_primer.fasta \
  -O DK0911.snps.indels.g.vcf.gz
#-V: input variant files, -R: reference genome, -O: output file

#Joint genotyping of the combined GVCF
~/yuxiang.li/005_GTAK/gatk-4.1.9.0/gatk GenotypeGVCFs \
  -V DK0911.snps.indels.g.vcf.gz \
  -R ~/jichen.dai/DK0911_primer.fasta \
  -O DK0911.snps.indels.vcf
#-V: input variant file, -R: reference genome, -O: output file

#Quality control
#--vcf needs a file: the input is the joint-genotyped VCF written by GenotypeGVCFs (the file name was missing)
vcftools --vcf DK0911.snps.indels.vcf --recode --recode-INFO-all --maf 0.05 --minQ 200 --max-missing 0.3 --min-meanDP 38 --max-meanDP 71 --remove-indels --out Pst.snps
#--maf: keep sites with a Minor Allele Frequency greater than or equal to 0.05
#--max-missing: exclude sites on the basis of the proportion of missing data (0 allows completely missing sites, 1 allows no missing data)
#--min-meanDP/--max-meanDP: keep sites with mean depth values (over all included individuals) greater than or equal to the "--min-meanDP" value and less than or equal to the "--max-meanDP" value
#--remove-indels: remove indels
#--minQ: keep sites with quality value above 200, --recode: write a new VCF file containing the sites that pass the filters
#--recode-INFO-all: keep all INFO fields in the recoded file
#--out: prefix of the output file (the result is Pst.snps.recode.vcf)

####################################################################################################################################
####################################################################################################################################
#Annotation of SNP variants with SnpEff pipeline
####################################################################################################################################
####################################################################################################################################

#Build the SnpEff annotation database of DK0911
java -jar snpEff.jar build \
  -c snpEff.config \
  -gff3 \
  -v \
  DK0911_p
#-c: configuration file; -gff3: the annotation is in GFF3 format; -v: verbose output; DK0911_p: name of the genome database

#Annotate the filtered SNPs with the DK0911_p database
java -jar snpEff.jar ann \
  DK0911_p \
  Pst.snps.recode.vcf \
  > Pst.ann.snps.vcf

####################################################################################################################################
####################################################################################################################################
#Linkage disequilibrium filtering
####################################################################################################################################
####################################################################################################################################

#converting vcf format to plink accessed format
#NOTE(review): the original commands had lost all of their file names; they are restored through the two
#variables below - adjust them to your data (the defaults follow the output of the quality-control step)
vcf_in=Pst.snps.recode.vcf
prefix=Pst.snps
plink --vcf "$vcf_in" --recode --out "$prefix" --allow-extra-chr --chr-set 94 no-xy no-mt
#rewrite the variant ID (column 2 of the .map file) as <chromosome>_<position>; presumably so that
#--extract can select variants by ID - confirm
awk '{print $1"\t"$1"_"$4"\t"$3"\t"$4}' "$prefix.map" > "$prefix.pos.map"
mv "$prefix.ped" "$prefix.pos.ped"

#Linkage disequilibrium filtering
#--indep-pairwise writes <out>.prune.in (kept variants) and <out>.prune.out; the original extracted
#".prune.in" in one command and the non-existent ".pruned.in" in the other
plink --file "$prefix.pos" --indep-pairwise 50 1 0.2 --out "$prefix.LD" --allow-extra-chr --chr-set 94 no-xy no-mt
plink --file "$prefix.pos" --recode vcf --extract "$prefix.LD.prune.in" --out "$prefix.LD.pruned" --allow-extra-chr --chr-set 94 no-xy no-mt
plink --file "$prefix.pos" --make-bed --extract "$prefix.LD.prune.in" --out "$prefix.LD.pruned" --allow-extra-chr --chr-set 94 no-xy no-mt

#--file: prefix of the input PED/MAP files, --vcf: input VCF file
#--indep-pairwise: takes the same first two parameters as --indep. Its third parameter is a pairwise r2 threshold: at each step, pairs of variants in the current window with squared correlation greater than the threshold are noted, and variants are greedily pruned from the window until no such pairs remain. Since it does not need to keep the entire <window size> x <window size> correlation matrix in memory, it is usually capable of handling 6-digit window sizes well outside --indep's reach.
#--recode: recode the compute results, --allow-extra-chr: PLINK reports an error if the input data contains unrecognized chromosome codes . If none of the additional codes start with a digit, you can permit them with the --allow-extra-chr flag. (These contigs are ignored by most analyses which skip unplaced regions.)
#--chr-set: changes the chromosome set, The first parameter specifies the number of diploid autosome pairs if positive, or haploid chromosomes if negative. (Polyploid and aneuploid data are not supported, and there is currently no special handling of sex or mitochondrial chromosomes in all-haploid chromosome sets.)

####################################################################################################################################
####################################################################################################################################
#Constructing maximum likelihood (ML) phylogenetic trees
####################################################################################################################################
####################################################################################################################################

#RAxML code
#the model name is GTRGAMMA; the original "GTRGAMMMA" is not a valid RAxML model
nohup ~/biosoft/standard-RAxML-8.2.12/raxmlHPC-PTHREADS-AVX -f a -x 12345 -p 12345 -s ./2-1/2-1.LD.pruned.min4.phy -T 12 -m GTRGAMMA -n 2-1 -# 1000 > 2-1.log 2>&1 &
#-f a: rapid bootstrap analysis and search for the best-scoring ML tree in one run
#-x/-p: random seeds (rapid bootstrapping / parsimony inferences), both 12345; -s: input phylip file; -T: twelve threads were used in this analysis
#-m: use GTRGAMMA as the nucleotide substitution model, -n: output file name, -#: number of bootstrap replicates

####################################################################################################################################
####################################################################################################################################
#Principal component analysis (PCA)
####################################################################################################################################
####################################################################################################################################

#PCA with plink
#NOTE(review): the original command had lost the arguments of --bfile and --out
#pca_out is "2-1" because the plotting code reads 2-1.eigenvec; bed_prefix is a guess - set it to the
#prefix of the LD-pruned .bed/.bim/.fam files
bed_prefix=2-1.LD.pruned
pca_out=2-1
plink --allow-extra-chr --chr-set 94 no-xy no-mt --threads 12 --bfile "$bed_prefix" --pca 20 --out "$pca_out"
#--bfile: prefix of the input BED/BIM/FAM files; --pca: extracts the top 20 principal components of the variance-standardized relationship matrix
#--threads: twelve threads used in PCA analysis, --out: prefix of the output files (.eigenvec and .eigenval)
#--indep-pairwise: takes the same first two parameters as --indep. Its third parameter is a pairwise r2 threshold: at each step, pairs of variants in the current window with squared correlation greater than the threshold are noted, and variants are greedily pruned from the window until no such pairs remain. Since it does not need to keep the entire <window size> x <window size> correlation matrix in memory, it is usually capable of handling 6-digit window sizes well outside --indep's reach.
#--recode: recode the compute results, --allow-extra-chr: PLINK reports an error if the input data contains unrecognized chromosome codes (such as hg19 haplotype chromosomes or unplaced contigs). If none of the additional codes start with a digit, you can permit them with the --allow-extra-chr flag. (These contigs are ignored by most analyses which skip unplaced regions.)
#--chr-set: changes the chromosome set. The first parameter specifies the number of diploid autosome pairs if positive, or haploid chromosomes if negative. (Polyploid and aneuploid data are not supported, and there is currently no special handling of sex or mitochondrial chromosomes in all-haploid chromosome sets.)

#PCA plotting
library(ggplot2)

# PLINK .eigenvec table: the first two columns identify the sample, the principal components start in column 3
dd <- read.table("2-1.eigenvec", header = FALSE)
data <- dd[, 3:4]
# indvs.txt: two tab-separated columns, used below as sample name and group
sample <- read.table("indvs.txt", sep = "\t", header = FALSE)
# the two tables are joined by position, so they must describe the same samples in the same order
stopifnot(nrow(sample) == nrow(data), ncol(sample) == 2)
data <- cbind(data, sample)
colnames(data) <- c("PC1", "PC2", "Sample", "Group")
# NOTE(review): the percentages in the axis labels are hard-coded - update them from the .eigenval file
p <- ggplot(data, aes(x = PC1, y = PC2, color = Group)) +
  geom_point() +
  xlab("PC1(17.56%)") +
  ylab("PC2(9.15%)") +
  scale_color_manual(values = c("#1D72F5", "#DF0101", "#77CE61", "#FF9326")) +
  theme_bw()
tiff("PCA_first_2021.8.6.tiff", res = 400, height = 2500, width = 3000)
# print() explicitly: a bare `p` is not drawn when the script is run through source(), leaving an empty file
print(p)
dev.off()

####################################################################################################################################
####################################################################################################################################
#Detecting population genetic structures
####################################################################################################################################
####################################################################################################################################

#ADMIXTURE: run the analysis once for every number of ancestral populations K = 1..17
#every run is also logged to log<K>.out (the logs hold the cross-validation results)
for k in {1..17}
do
  /users/huxp/biosoft/miniconda3/envs/daijc-py37/bin/admixture \
    -B500 \
    -j10 \
    --cv=10 \
    /users/huxp/jichen.dai/dataset/150/plink/LD/combine.150.snps.LD.pruned.bed \
    $k \
    | tee /users/huxp/jichen.dai/workdir/admixture/2021.8.16/all/log/log${k}.out
done
#-B500: bootstrap with 500 replicates, -j10: ten threads, --cv=10: ten-fold cross-validation

#Population genetic structure plotting (bar plot of one ADMIXTURE Q file)
library(pophelper)
library(gridExtra)

# Q matrix (ancestry proportions) of one ADMIXTURE run
sfile <- readQ("combine.154.snps.LD.pruned.2.Q")

# colour palette for the clusters
clist <- list(
  "shiny" = c(
    "#1D72F5", "#DF0101", "#77CE61", "#FF9326", "#A945FF", "#0089B2",
    "#FDF060", "#FFA6B2", "#BFF217", "#60D5FD", "#CC1577", "#F2B950",
    "#7FB21D", "#EC496F", "#326397", "#B26314", "#027368", "#A4A4A4",
    "#610B5E"
  )
)

# tree_rank1.txt: column V1 = individual labels, column V2 = group labels
inds <- read.table("tree_rank1.txt", header = FALSE, stringsAsFactors = FALSE)
rownames(sfile[[1]]) <- inds$V1

p1 <- plotQ(
  sfile[1],
  returnplot = TRUE,
  exportplot = FALSE,
  basesize = 11,
  clustercol = clist$shiny, # colours of the clusters
  showindlab = TRUE, # individual labels on the axis, taken from the row names
  useindlab = TRUE,
  indlabangle = 90,
  indlabvjust = 1,
  grplab = data.frame(inds$V2), # group labels drawn below the bars
  linesize = 0.8,
  pointsize = 4,
  grplabsize = 4
)
grid.arrange(p1$plot[[1]]) # the returned plot has to be drawn with grid

####################################################################################################################################
####################################################################################################################################
#Genetic diversity and heterozygosity identification
####################################################################################################################################
####################################################################################################################################

#Calculate pi indices with VCFtools in python code

import os

VCFTOOLS = '/users/huxp/biosoft/miniconda3/envs/daijc-py37/bin/vcftools'

# vcf.list holds one VCF path per line; the context manager closes it even if the loop fails
with open("vcf.list", "r") as file1:
    for line in file1:
        # strip() also removes a trailing "\r" (Windows line endings) and surrounding blanks
        filename = line.strip()
        if not filename:
            # a blank line would run vcftools with an empty --vcf argument
            continue
        # output prefix = file name up to the first dot, taken from the cleaned path
        local = os.path.basename(filename).split('.')[0]

        os.system('%s --vcf %s --window-pi 5000 --out %s' % (VCFTOOLS, filename, local))
#--vcf: Input VCF file, --window-pi: calculate pi within a 5000 bp window, --out: Output file

#pi boxplot
# Reads <clade>.windowed.pi (vcftools --window-pi output) for every clade, prints the mean and median
# pi of each clade, and draws one box per clade.
# The table is built per clade and bound once, instead of growing vectors in a loop; no global variable
# named `pi` is created any more, so the base constant `pi` stays usable.
clades <- c("clade1.1", "clade1.2", "clade1.3", "clade2.1", "clade2.2", "clade2.3", "clade2.4")

pi_by_clade <- lapply(clades, function(i) {
  cluster1 <- read.table(paste0(i, ".windowed.pi"), header = TRUE)
  print(i)
  print(mean(cluster1$PI))
  print(median(cluster1$PI))
  data.frame(
    pi = cluster1$PI,
    location = rep(i, times = nrow(cluster1)),
    stringsAsFactors = FALSE
  )
})
pi.table1 <- do.call(rbind, pi_by_clade)

p <- ggplot(data = pi.table1) +
  geom_boxplot(aes(x = location, y = pi, color = location)) +
  theme_minimal() +
  theme(panel.grid = element_blank())

tiff("pi_boxplot.tiff", width = 2000, height = 1000, res = 200)
# print() explicitly: a bare `p` is not drawn when the script is run through source()
print(p)
dev.off()

#observed and expected heterozygosity per sample with PLINK (--het)
#NOTE(review): --file expects the prefix of a .ped/.map pair; "vcf.file" looks like a placeholder - confirm
plink \
  --file vcf.file \
  --het \
  --allow-extra-chr \
  --chr-set 94 no-xy no-mt
#--recode: recode the compute results, --allow-extra-chr: PLINK reports an error if the input data contains unrecognized chromosome codes. If none of the additional codes start with a digit, you can permit them with the --allow-extra-chr flag. (These contigs are ignored by most analyses which skip unplaced regions.)
#--chr-set: changes the chromosome set. The first parameter specifies the number of diploid autosome pairs if positive, or haploid chromosomes if negative. (Polyploid and aneuploid data are not supported, and there is currently no special handling of sex or mitochondrial chromosomes in all-haploid chromosome sets.)

#Permutation test

library(coin)

# two columns copied to the clipboard: V1 is labelled "Ho" and V2 "he" below
# NOTE(review): read.delim("clipboard") only works on Windows
dat1 <- read.delim("clipboard", header = FALSE)

a <- as.numeric(c(dat1$V1, dat1$V2))
b <- rep(c("Ho", "he"), each = nrow(dat1))

# build the data frame column by column: cbind(a, b) coerces the numbers to character, and before R 4.0
# as.data.frame() then turned them into factors whose as.numeric() gives level codes, not the values
mydata <- data.frame(a = a, b = factor(b))

# NOTE(review): distribution = "asymptotic" uses the asymptotic reference distribution; for a resampling-based
# permutation test coin offers distribution = approximate(...) - confirm which one is intended
oneway_test(a ~ b, data = mydata, distribution = "asymptotic")

#Tajima's D calculation-Calculate Tajima's D indices with VCFtools in python code

import os

VCFTOOLS = '/users/huxp/biosoft/miniconda3/envs/daijc-py37/bin/vcftools'

# vcf.list holds one VCF path per line; the context manager closes it even if the loop fails
with open("vcf.list", "r") as file1:
    for line in file1:
        # strip() also removes a trailing "\r" (Windows line endings) and surrounding blanks
        filename = line.strip()
        if not filename:
            # a blank line would run vcftools with an empty --vcf argument
            continue
        # output prefix = file name up to the first dot, taken from the cleaned path
        local = os.path.basename(filename).split('.')[0]

        os.system('%s --vcf %s --TajimaD 5000 --out %s' % (VCFTOOLS, filename, local))
#--vcf: input VCF file, --TajimaD: calculate Tajima's D within a 5000 bp window, --out: Output file

#Tajima's D boxplot

# Reads <cluster>.Tajima.D.nonan.txt for every cluster (column TajimaD) and draws one violin per
# cluster, with the mean marked as a point.
# The table is built per cluster and bound once, instead of growing vectors in a loop.
clusters <- c("1-1", "1-2", "1-3", "2-1", "2-2", "2-3", "2-4")

tajima_by_cluster <- lapply(clusters, function(i) {
  cluster1 <- read.table(paste0(i, ".Tajima.D.nonan.txt"), header = TRUE)
  data.frame(
    TajimaD = cluster1$TajimaD,
    location = rep(i, times = nrow(cluster1)),
    stringsAsFactors = FALSE
  )
})
TajimaD.table1 <- do.call(rbind, tajima_by_cluster)

# `fun` replaces the `fun.y` argument, which is deprecated since ggplot2 3.3.0
p <- ggplot(data = TajimaD.table1, aes(x = location, y = TajimaD, fill = location)) +
  geom_violin() +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 2)
tiff("TajimaD_boxplot.tiff", width = 2000, height = 1000, res = 200)
# print() explicitly: a bare `p` is not drawn when the script is run through source()
print(p)
dev.off()

#Kruskal-Wallis test with multiple comparisons (kruskalmc)
library("spdep")
library(pgirmess)

# group labels: 2 x "A", 4 x "B", 3 x "C" (nine observations in total)
group <- rep(c("A", "B", "C"), times = c(2, 4, 3))
# NOTE(review): `sample` has to be a numeric vector with one value per element of `group`; it is not
# defined in this section - confirm where it comes from before running
data <- cbind(group, sample)
# the significance-level argument is spelled `probs`; the original `probS` did not match it (R is case-sensitive)
kruskalmc(sample, group, probs = 0.05)


####################################################################################################################################
####################################################################################################################################
#Determination of population differentiation, introgression, and migration
####################################################################################################################################
####################################################################################################################################

#FST calculation-Calculate FST indices with VCFtools in python code

import os

# cluster names; each one has a sample list <name>.txt in the cluster directory
a = ["1.1", "1.2", "1.3", "2.1", "2.2", "2.3", "2.4"]

# vcftools call for one pair of clusters; "~" is expanded by the shell that os.system starts
COMMAND = (
    "/users/huxp/biosoft/miniconda3/envs/daijc-py37/bin/vcftools"
    " --vcf /users/huxp/jichen.dai/dataset/150/combine.150.snps.depth.recode.vcf"
    " --weir-fst-pop ~/jichen.dai/dataset/150/cluster/{first}.txt"
    " --weir-fst-pop ~/jichen.dai/dataset/150/cluster/{second}.txt"
    " --out {first}_{second}"
)

# every unordered pair of clusters exactly once, in list order
for position, first in enumerate(a):
    for second in a[position + 1:]:
        os.system(COMMAND.format(first=first, second=second))
#--vcf: input VCF file, --out: output file; 
#--weir-fst-pop: This option is used to calculate an Fst estimate from Weir and Cockerham’s 1984 paper. This is the preferred calculation of Fst. The provided file must contain a list of individuals (one individual per line) from the VCF file that correspond to one population. This option can be used multiple times to calculate Fst for more than two populations.

#FST heatmap plotting
library(dplyr)
library(readxl)

# wide FST table: column "x" holds the row cluster, the remaining columns hold one cluster each
df <- read_excel("clade2.1.xlsx", sheet = "Sheet1")

# wide -> long; melt()'s argument is `variable.name` (singular) - the original `variable.names` did not match it
dftmp <- df %>%
  reshape2::melt(id.vars = "x", variable.name = "y") %>%
  na.omit()

colnames(dftmp) <- c("x", "y", "value")

# both axes use the same cluster order
cluster_levels <- c(
  "Cluster 2.1.1", "Cluster 2.1.2", "Cluster 2.1.3",
  "Cluster 2.2", "Cluster 2.3", "Cluster 2.4"
)
dftmp$x <- factor(dftmp$x, levels = cluster_levels)
dftmp$y <- factor(dftmp$y, levels = cluster_levels)

# p: white tiles with a square per cluster pair, sized by |FST| and coloured by FST
p <- ggplot() +
  geom_tile(data = dftmp, aes(x, y), fill = "white", color = "grey") +
  geom_point(data = dftmp, aes(x, y, size = abs(value), color = value),
             shape = 15)

# p1: the same layers plus the final theme, axis positions and colour scale
# NOTE(review): neither plot is printed or saved in this section
p1 <- p +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  scale_x_discrete(position = "top") +
  scale_y_discrete(position = "right") +
  labs(x = NULL, y = NULL) +
  scale_color_gradient(low = "blue", high = "red")

#Filter the VCF file, keeping only the biallelic sites
vcftools \
  --vcf combine.150.PT.snps.depth.recode.vcf \
  --min-alleles 2 \
  --max-alleles 2 \
  --recode \
  --recode-INFO-all \
  --out combine.150.PT.biallelic

#Dsuite code
#options are placed before the positional arguments, as in the Dsuite usage line (Dtrios [OPTIONS] INPUT.vcf SETS.txt)
Dsuite Dtrios -t tree.nwk combine.150.PT.biallelic.recode.vcf pop.file
#-t: input tree structure

#fbranch statistic
#the tree must be the one given to Dtrios and to the plotting script, so tree.nwk is used here too
#(the original named a different file, TREE_FILE.nwk)
#NOTE(review): FVALS_tree.txt stands for the "_tree.txt" output of Dtrios - replace it with the actual file name
Dsuite Fbranch tree.nwk FVALS_tree.txt > fbranch.txt

#fbranch plot with built-in python code 
python ~/biosoft/Dsuite/utils/dtools.py ./fbranch.txt ./tree.nwk

#Treemix for different m: 20 runs (i) for every number of migration edges m = 1..7
#the original had "\ " (backslash followed by a space) after the -o line, which ended the command there:
#-m, -k, -root and -noss were never passed to treemix and "-m" was executed as a separate command
for m in {1..7}
do
  for i in {1..20}
  do
    /users/huxp/biosoft/miniconda3/envs/treemix/bin/treemix -se -bootstrap \
      -i /users/huxp/jichen.dai/workdir/treemix/2021.10.15/treemix/data/PST.treemix.frq.gz \
      -o PST.treemix.${i}.${m} \
      -m ${m} \
      -k 500 \
      -root Outgroup \
      -noss
  done
done
#-i: input allele frequency file; -o: output prefix (<stem>.<run>.<m>); -m: number of migration edges
#-k: number of SNPs per block; -root: population used as the outgroup; -noss: turn off the sample size correction
#-se: calculate standard errors of the migration weights; -bootstrap: run on a bootstrap replicate of the data

#OptM: find the best number of migration edges (m) from the TreeMix runs - R code
library(OptM)
# optM() is given the folder that holds the TreeMix output files
linear <- optM("./output")
plot_optM(linear)
dev.off()

#TreeMix tree plotting with the plotting functions distributed with TreeMix
source("plotting_funcs.R")
# "TreeMix" is the stem (output prefix) of the TreeMix run to draw; the closing parenthesis was missing
plot_tree("TreeMix")
dev.off()