#########################################
###  LIBRRIES
#########################################

### 4 types of libraries were sequenced: Paired-end on Illumina_Hiseq (PE) ; Paired-end on Illumina_miseq (miseq) ; Mate-Pair on Illumina_Hiseq (MP) ; PacBio on PacBio (PacBio)

################
### Reads Post processing
################
### For assemblies, MaSurCa works best with raw reads (illumina&PacBio); Canu was used with the Long raw reads longer than 1500bp. 
### For scaffolding and correcting/polishing steps, PE and MP reads were post-rocessed using Trimomatic (all), bbnorm (PE), bbmerge (ovelapping PE libraries) and Nextclip/Prinseq (MPs)

#########
### Trimmomatic
#########
#! /usr/bin/env bash

if [ $# -ne 3 ]
then
    echo "Usage: $0 <fq.1> <fq.2> <output>"
    exit -1
fi

QUALITY=33
THREADS=6
MINLEN=50
FILE1=${1}
FILE2=${2}
OUT=${3}

/usr/bin/java -classpath /path/to/Trimmomatic-0.30/trimmomatic-0.30.jar \
org.usadellab.trimmomatic.TrimmomaticPE -threads ${THREADS} -phred33 ${FILE1} ${FILE2} \
${OUT}_1_paired.fq.gz ${OUT}_1_unpaired.fq.gz ${OUT}_2_paired.fq.gz ${OUT}_2_unpaired.fq.gz \
ILLUMINACLIP:/path/to/Trimmomatic-0.30/adapters/clippingmiseq.fa:2:30:10 LEADING:${QUALITY} \
TRAILING:${QUALITY} SLIDINGWINDOW:4:${QUALITY} MINLEN:${MINLEN}

./fastqc_4.sh ${OUT}_1_paired.fq.gz ${OUT}_1_unpaired.fq.gz ${OUT}_2_paired.fq.gz ${OUT}_2_unpaired.fq.gz


################
### PAIRED_END Post processing
################


#########
### bbnorm PAIRED-END error correction
#########
#! /usr/bin/env bash

if [ $# -ne 3 ]
then
    echo "Usage: $0 <fq1.gz> <fq2.gz> <targetValue>"
    exit -1
fi

FILE1=$(echo ${1} | sed 's/_paired.fq.gz$//')
FILE2=$(echo ${2} | sed 's/_paired.fq.gz$//')
QVALUE=${3}

bbnorm.sh -Xmx200g  in=${1} in2=${2} extra=bbnormT40/miseq12_1_bbnorm.fq extra=bbnormT40/miseq12_2_bbnorm.fq  out=${FILE1}_bbnorm_T${QVALUE}.fq out2=${FILE2}_bbnorm_T${QVALUE}.fq outt=${FILE1}_bbnormtrashT${QVALUE}.txt k=25 prefilter=t threads=16  fixspikes=t target=${QVALUE} ecc=t cec=t  hist=${FILE1}_T${QVALUE}_histogram


#########
### bbmerge MISEQ sequence merge
#########
#! /usr/bin/env bash

if [ $# -ne 2 ]
then
    echo "Usage: $0 <seq1.fq.gz> <seq2.fq.gz>"
    exit -1
fi

OUT=$(echo ${1} | sed 's/_1.fq.gz$//')

# bbmerge
/path/to/bbmap/bbmerge.sh in1=${1} in2=${2} qtrim=t trimq=20 verystrict=t out=${OUT}_merged.fq outu1=${OUT}_pair1.fq outu2=${OUT}_pair2.fq
# Trimmomatic
/usr/bin/java -classpath /path/to/Trimmomatic-0.30/trimmomatic-0.30.jar org.usadellab.trimmomatic.TrimmomaticSE -threads 12 -phred33 ${OUT}_merged.fq ${OUT}_mergedreads.fq.gz ILLUMINACLIP:/path/to/Trimmomatic-0.30/adapters/clippingmiseq.fa:2:30:10 LEADING:33 TRAILING:33 SLIDINGWINDOW:4:33 MINLEN:250

################
### MATE PAIR Post processing
################

##########
#Nextclip.sh
##########
#! /usr/bin/env bash

if [ $# -ne 3 ]
then
    echo "Usage: $0 <seq1.fq> <seq2.fq> <number_of_reads>"
    exit -1
fi

OUT=$(echo ${1} | sed 's/.fq$//')

### Nextclip
/path/to/nextclip-master/bin/nextclip -i ${1} -j ${2} -o ${OUT}_nextclip --min_length 45 --number_of_reads ${3} --trim_ends 0 --remove_duplicates
/path/to/nextclip-master/bin/nextclip -i ${OUT}_nextclip_D_R1.fastq -j ${OUT}_nextclip_D_R2.fastq -o ${OUT}_nextclipRel --min_length 50 --number_of_reads ${3} --trim_ends 0 -x '29,14' -y '28,14'
### trim class A, B, C
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclip_A_R1.fastq -fastq2 ${OUT}_nextclip_A_R2.fastq -trim_right 6 -out_good ${OUT}_A
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_A_R1.fastq -fastq2 ${OUT}_nextclipRel_A_R2.fastq -trim_right 6 -out_good ${OUT}_Arel
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclip_B_R2.fastq -trim_right 6 -out_good ${OUT}_B
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_B_R2.fastq -trim_right 6 -out_good ${OUT}_Brel
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclip_C_R1.fastq -trim_right 6 -out_good ${OUT}_C
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_C_R1.fastq -trim_right 6 -out_good ${OUT}_Crel
perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_D_R1.fastq -fastq2 ${OUT}_nextclipRel_D_R2.fastq -trim_right 10 -min_len 50 -out_good ${OUT}_D
### cat nextclip A, B, and C classes of paired reads
cat ${OUT}_A_1.fastq   ${OUT}_Arel_1.fastq   ${OUT}_nextclip_B_R1.fastq   ${OUT}_nextclipRel_B_R1.fastq   ${OUT}_C.fastq   ${OUT}_Crel.fastq  >  ${OUT}_ABC_R1.fastq
cat ${OUT}_A_2.fastq   ${OUT}_Arel_2.fastq   ${OUT}_B.fastq   ${OUT}_Brel.fastq   ${OUT}_nextclip_C_R2.fastq   ${OUT}_nextclipRel_C_R2.fastq  >  ${OUT}_ABC_R2.fastq





#########################################
###  ASSEMBLY
#########################################


#########################################
###   MasurCa Illumina_ONLY  V151
#########################################

# v1.5.1 configuration file

# DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields:
# 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads
# 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be
# innies, i.e. --->.<---, and JUMP are assumed to be outties
# <---.--->. If there are any jump libraries that are innies, such as
# longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads
# are optional for PE libraries and mandatory for JUMP libraries. Any
# OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
# converted into Celera Assembler compatible .frg files (see
# http://wgs-assembler.sourceforge.com)
# MASURCA: /Path/To/build/MaSuRCA-3.2.2/bin/masurca

DATA
PE=  p4  190  40  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_2.fq.gz
PE=  p1  400  100  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_2.fq.gz
PE=  p2  280  50  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_2.fq.gz
PE=  p3  280  50    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_2.fq.gz
PE=  m1  380  90    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_1.fq.gz  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_2.fq.gz
PE=  m2  380  90    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_2.fq.gz
PE=  m3  400  100    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_2.fq.gz
JUMP=  j5  4400  650    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_2.fq.gz
JUMP=  j4  3600  600    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_2.fq.gz
JUMP=  j3  6000  900    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_2.fq.gz
JUMP=  j2  8500  1200    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_2.fq.gz
JUMP=  j1  11000  2000    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_2.fq.gz
#pacbio reads must be in a single fasta file! make sure you provide absolute path
#PACBIO=/Path/To/home/pganot/data/runs/genome_CR130114/pacbio_fastq/pacbio/1_all_fastq/pacbio_subreads_all.fasta
#OTHER=/FULL_PATH/file.frg
#PE reads coverage estimated > 186X
#JUMP reads coverage estimated 80X
#PACBIO coverage estimated 0X
END

PARAMETERS
#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content
GRAPH_KMER_SIZE = auto
#set this to 1 for all Illumina-only assemblies
#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs
#otherwise keep at 0
USE_LINKING_MATES = 0
#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms
LIMIT_JUMP_COVERAGE = 300
#these are the additional parameters to Celera Assembler.  do not worry about performance, number or processors or batch sizes -- these are computed automatically.
#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
CA_PARAMETERS =  cgwErrorRate=0.15
#minimum count k-mers used in error correction 1 means all k-mers are used.  one can increase to 2 if Illumina coverage >100
KMER_COUNT_THRESHOLD = 2
#auto-detected number of cpus to use
NUM_THREADS = 36
#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage
JF_SIZE = 100000000000
#set this to 1 to use SOAPdenovo contigging/scaffolding module.  Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes
SOAP_ASSEMBLY=0
END


#########################################
###   CANU canu16
#########################################

(base) [pganot@localhost assembly]$ cat /Path/To/home/pganot/assembly/canu/canu1_6.sh
#! /usr/bin/env bash


#####this uses the stabe release canu1-6




export PATH=/Path/To/build/canu-1.6/Linux-amd64/bin:$PATH

canu -p pbraw -d v1.1 genomeSize=550m maxMemory=650 maxThreads=36 -pacbio-raw pacbio_subreads_raw.fastq

#########################################
###   MasurCa_Hybrid Illumina_PacBIO V152
#########################################


# v1.5.2 configuration file

# same config as v1.5.1 configuration file with uncommented line for PACBIO subreads

# DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields:
# 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads
# 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be
# innies, i.e. --->.<---, and JUMP are assumed to be outties
# <---.--->. If there are any jump libraries that are innies, such as
# longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads
# are optional for PE libraries and mandatory for JUMP libraries. Any
# OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
# converted into Celera Assembler compatible .frg files (see
# http://wgs-assembler.sourceforge.com)
# MASURCA: /Path/To/build/MaSuRCA-3.2.2/bin/masurca

DATA
PE=  p4  190  40  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_2.fq.gz
PE=  p1  400  100  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_2.fq.gz
PE=  p2  280  50  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_2.fq.gz
PE=  p3  280  50    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_1.fq.gz   /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_2.fq.gz
PE=  m1  380  90    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_1.fq.gz  /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_2.fq.gz
PE=  m2  380  90    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_2.fq.gz
PE=  m3  400  100    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_2.fq.gz
*JUMP=  j5  4400  650    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_2.fq.gz
JUMP=  j4  3600  600    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_2.fq.gz
JUMP=  j3  6000  900    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_2.fq.gz
JUMP=  j2  8500  1200    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_2.fq.gz
JUMP=  j1  11000  2000    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_1.fq.gz    /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_2.fq.gz
#pacbio reads must be in a single fasta file! make sure you provide absolute path
PACBIO=/Path/To/home/pganot/data/runs/genome_CR130114/pacbio_fastq/pacbio/1_all_fastq/pacbio_subreads_all.fasta
#OTHER=/FULL_PATH/file.frg
#PE reads coverage estimated > 186X
#JUMP reads coverage estimated 80X
#PACBIO coverage estimated 0X
END

PARAMETERS
#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content
GRAPH_KMER_SIZE = auto
#set this to 1 for all Illumina-only assemblies
#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs
#otherwise keep at 0
USE_LINKING_MATES = 0
#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms
LIMIT_JUMP_COVERAGE = 300
#these are the additional parameters to Celera Assembler.  do not worry about performance, number or processors or batch sizes -- these are computed automatically.
#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
CA_PARAMETERS =  cgwErrorRate=0.15
#minimum count k-mers used in error correction 1 means all k-mers are used.  one can increase to 2 if Illumina coverage >100
KMER_COUNT_THRESHOLD = 2
#auto-detected number of cpus to use
NUM_THREADS = 36
#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage
JF_SIZE = 100000000000
#set this to 1 to use SOAPdenovo contigging/scaffolding module.  Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes
SOAP_ASSEMBLY=0
END


#########################################
###   Assembly post-processing
#####################################
#### IMPORTANT NOTE: after every assembly step of the pipeline (MaSurCa assembly V151,152, LRnascaff, Rascaf, sspace), scaffolds containing gaps with more than 10,000 Ns were split into 2 contigs without Ns with the following homemade scripts: 
##############################
 
########  split_longN_fasta.sh
#! /usr/bin/env bash
if [ $# -ne 1 ]
then
    echo "Usage: $0 <file.fasta>"
    exit -1
fi
BASE=$(echo $1 | sed 's/.fasta$//')

## reformat fasta to have 1 line sequences
/Path/To/build/bbmap/reformat.sh in=${1} out=${BASE}_1.fasta fastawrap=0

## replace stretch of Ns longer than 10000 by >tmp
perl -p -e 's/N{10000,}/\n>tmp\n/g' ${BASE}_1.fasta > ${BASE}_2.fasta

## remove any empty sequences, e.g. Ns as the begining or the end of the sequence.
### awk cmd explanation: With RS, we declare that the records are separated by >. If the record contains a second line $2 (i.e. is not an empty fasta record), print the record (add a > in front)
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' ${BASE}_2.fasta > ${BASE}_3.fasta

## remove sequence shorter than 500 bp
/Path/To/build/kentLib/faFilter -minSize=500 ${BASE}_3.fasta ${BASE}_4.fasta

## rename scaffold
scaff_header.sh ${BASE}_4.fasta > ${BASE}_max10000N.fasta

## clean & stats
rm ${BASE}_1.fasta
rm ${BASE}_2.fasta
rm ${BASE}_3.fasta
rm ${BASE}_4.fasta

summarizeAssembly.sh ${BASE}_max10000N.fasta           # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page

exit ;
######




#########################################
###   SCAFFOLDING
#########################################

#########
### Lrna_Scaffolder
#########

#! /usr/bin/env bash

if [ $# -ne 1 ]
then
    echo "Usage: $0 <contig.fasta>"
    exit -1
fi

CONT=$(echo $1 | sed 's/_max10000N.fasta$//')


####### 1/ Blat the "ban60_mRNA_softmask_longestIsoform.fasta" transcripts against the contigs
/Path/To/build/kentLib/blat/blat ${1} /Path/To/home/pganot/repeats/redUnix64/2banv60_longIso/ban60_mRNA_softmask_longestIsoform.fasta  blat_${CONT}_vs_banv60.psl -noHead -qMask=lower


###### 2/ run L_RNA_scaffolder
/Path/To/build/L_RNA_scaffolder/L_RNA_scaffolder.sh -d /Path/To/build/L_RNA_scaffolder/ -i blat_${CONT}_vs_banv60.psl -j ${1}

mv  L_RNA_scaffolder.fasta ${CONT}_lrna.fasta
summarizeAssembly.sh ${CONT}_lrna.fasta            # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page

split_longN_fasta.sh ${CONT}_lrna.fasta           # gaps with more than 10,000 Ns were split into 2 contigs without Ns (see script aforementioned)

../clean_scaffolder.sh

#########
### RASCAF
#########

#! /usr/bin/env bash

if [ $# -ne 1 ]
then
    echo "Usage: $0 <file.fasta>"
    exit -1
fi

export PATH=/Path/To/build/Rascaf:$PATH

BASE=$(echo ${1} | sed 's/_max10000N.fasta$//')

### create genome index
bwa index -p ${BASE} -a is ${1}

### run bwa mem with default parameter
bwa mem   -t 12   ${BASE} rna_miseq_uniq_1.fastq rna_miseq_uniq_2.fastq  > ${BASE}_miseq.sam
samtools view -u ${BASE}_miseq.sam | samtools sort -m 100G -@ 4 - ${BASE}_miseq
samtools stats ${BASE}_miseq.bam | grep -e "The command line was"  -e "^SN" | sed 's/# //' |  cut -f 2,3 > ${BASE}_miseq.stats
rm ${BASE}.sam

bwa mem   -t 12   ${BASE} rna_pe1_uniq_1.fastq rna_pe1_uniq_2.fastq  > ${BASE}_tot.sam
samtools view -u ${BASE}_tot.sam | samtools sort -m 100G -@ 4 - ${BASE}_tot
samtools stats ${BASE}_tot.bam | grep -e "The command line was"  -e "^SN" | sed 's/# //' |  cut -f 2,3 > ${BASE}_tot.stats
rm ${BASE}_tot.sam


### run rascaf
rascaf -b ${BASE}_miseq.bam -f ${1} -breakN 26 -o ${BASE}_miseq -ml 100
rascaf -b ${BASE}_tot.bam -f ${1} -breakN 26 -o ${BASE}_tot -ml 100

### run rascaf-join
rascaf-join -r ${BASE}_miseq.out -r ${BASE}_tot.out -o ${BASE}_rascaf

### stats
mv ${BASE}_rascaf.fa  ${BASE}_rascaf.fasta
summarizeAssembly.sh ${BASE}_rascaf.fasta

split_longN_fasta.sh ${BASE}_rascaf.fasta           # gaps with more than 10,000 Ns were split into 2 contigs without Ns (see script aforementioned)
summarizeAssembly.sh ${BASE}_rascaf_max10000N.fasta       # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page

rm *.bam
rm *.out

exit ;

#########
### SSPACE
#########


#! /usr/bin/env bash

if [ $# -ne 1 ]
then
    echo "Usage: $0 <file.fasta>"
    exit -1
fi


BASE=$(echo $1 | sed 's/_max10000N.fasta$//')

export PATH=/Path/To/build/SSPACE-STANDARD-3.0_linux-x86_64/:$PATH

## Run
#Usage: /Path/To/build/SSPACE-STANDARD-3.0_linux-x86_64/SSPACE_Standard_v3.0.pl [SSPACE_Standard_v3.0_linux]
#
#============ General Parameters ============
#-l  Library file containing two mate pate files with insert size, error and either mate pair or paired end indication.
#-s  Fasta file containing contig sequences used for extension. Inserted pairs are mapped to extended and non-extended contigs (REQUIRED)
#-x  Indicate whether to extend the contigs of -s using paired reads in -l. (-x 1=extension, -x 0=no extension, default -x 0)
#============ Extension Parameters ============
#-m  Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 32)
#-o  Minimum number of reads needed to call a base during an extension (default -o 20)
#============ Scaffolding Parameters ============
#-z  Minimum contig length used for scaffolding. Filters out contigs that are below -z (default -z 0 (no filtering), optional).
#-k  Minimum number of links (read pairs) to compute scaffold (default -k 5, optional)
#-a  Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a 0.7, optional)
#-n  Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n 15, optional)
#============ Bowtie Parameters ============
#-g  Maximum number of allowed gaps during mapping with Bowtie. Corresponds to the -v option in Bowtie. *higher number of allowed gaps can lead to least accurate scaffolding* (default -v 0, optional)
#============ Additional Parameters ============
#-T  Specify the number of threads to run SSPACE, used both for reading the input readfiles and mapping the reads against the contigs. For reading in the files, multiple files are read-in simultaneously. With read-mapping, the readmapper is called multiple times with 1 million reads per calls (default -T 1, optional)
#-S  Skip the processing of the reads. Meaning that SSPACE was already run, but user now wants to use different extension/scaffold parameters.
#-b  Base name for your output files (optional)
#-v  Runs the scaffolding process in verbose mode (-v 1=yes, -v 0=no, default -v 0, optional)
#-p  Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p 0, optional)
#
#from the tutorial
#perl (path_to_SSPACE)/SSPACE_Standard_v3.0.pl -l libraries.txt -s contigs_abyss.fa -k 5 -a 0.7 -x 0 -b ecoli_scaffolds_no_extension
#perl (path_to_SSPACE)/SSPACE_Standard_v3.0.pl -l libraries.txt -s contigs_abyss.fa -k 5 -a 0.7 -x 1 -m 30 -o 20 -b ecoli_scaffolds_extension

SSPACE_Standard_v3.0.pl -l libraries.txt -s ${1} -z 100 -T 12 -k 5 -a 0.7 -x 0 -b ${BASE}_default           #libraries.txt correspond to the ABC-clipped MP reads (see Nextclip above)

cat ${BASE}_default/${BASE}_default.summaryfile.txt | sed -e 's/target:/target/'  -e 's/unsatisfied:/\n            unsatisfied:/' | tr '=' ':' > ${BASE}_default.summaryfile.txt.sed
mv ${BASE}_default/${BASE}_default.final.scaffolds.fasta  ${BASE}_sspace.fasta

split_longN_fasta.sh ${BASE}_sspace.fasta           # gaps with more than 10,000 Ns were split into 2 contigs without Ns (see script aforementioned)
summarizeAssembly.sh ${BASE}_sspace_max10000N.fasta                  # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page




#########################################
###   METASSEMBLER 
#########################################

## Metassemble configuration file  7

[global]

bowtie2_threads=12
bowtie2_read1=/Path/To/home/pganot/data/runs/genome_CR130114/data_genome/5_cleanMP/mp3_ABCclipped_1_paired.fq.gz
bowtie2_read2=/Path/To/home/pganot/data/runs/genome_CR130114/data_genome/5_cleanMP/mp3_ABCclipped_2_paired.fq.gz
bowtie2_maxins=6000
bowtie2_minins=3000

#estimated median size for MP3 is 4400
mateAn_A=3600
mateAn_B=5200
#nucmer_l = "Minimum length of an maximal exact match" ; nucmer_c = "Minimum cluster length"
nucmer_l=100
nucmer_c=500

#When linking two primary scaffolds to the same secondary scaffold, only use alignments with length>=l (Default: 60)
asseMerge_l=100


[1]
fasta=/Path/To/home/pganot/assembly/mzrk_canu/1Lrna_rascaf_sspace_gapcloser/mzrk152_max10000N_lrna_rascaf_max10000N_sspace.fasta
ID=152s

[2]
fasta=/Path/To/home/pganot/data/assembly/contigs/masurca/mzrk151/mzrk151_max10000N.fasta
ID=151

[3]
fasta=/Path/To/home/pganot/data/assembly/contigs/canu/canu1_6/pbraw.contigs.fasta
ID=canu16


## metass.sh

#! /usr/bin/env bash

export PATH=/Path/To/build/MUMmer3.22:$PATH
export PATH=/Path/To/build/Metassembler/bin:$PATH

metassemble --conf metassemble.config --outd ./MergeMetassemble

#########################################
###   CORRECTION_POLISH
#########################################



#####################
###   PILLON 
#####################

####Create bam alignements with trimmed, merged (when possible e.g. ovelappin miseq), and corrected reads 

#! /usr/bin/env bash

if [ $# -ne 1 ]
then
    echo "Usage: $0 <file.fasta>"
    exit -1
fi

BASE=$(echo ${1} | sed 's/.fasta$//')

## create genome index
bwa index -p ${BASE} -a is ${1}

## run bwa mem with default parameter
bwa mem  -t 8  ${BASE}  miseqmerge.fq.gz  > ${BASE}_miseqSE.sam
bwa mem  -t 8  ${BASE}  kaustPEmerged_min130.fq.gz  > ${BASE}_kaustSE.sam

samtools view -u ${BASE}_miseqSE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_miseq  -o ${BASE}_miseqSE.bam
samtools stats ${BASE}_miseqSE.bam | grep -e "The command line was"  -e "^SN" | sed 's/# //' |  cut -f 2,3 > ${BASE}_miseqSE.stats

samtools view -u ${BASE}_kaustSE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_kaust -o ${BASE}_kaustSE.bam
samtools stats ${BASE}_kaustSE.bam | grep -e "The command line was"  -e "^SN" | sed 's/# //' |  cut -f 2,3 > ${BASE}_kaustSE.stats

samtools index -b ${BASE}_miseqSE.bam
samtools index -b ${BASE}_kaustSE.bam


bwa mem  -t 8  ${BASE}  pe2_1_ecc.fq.gz pe2_2_ecc.fq.gz  > ${BASE}_pe2PE.sam
bwa mem  -t 8  ${BASE}  pe13_1_bbnorm_T70.fq.gz pe13_2_bbnorm_T70.fq.gz  > ${BASE}_pe13PE.sam

samtools view -u ${BASE}_pe2PE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_pe2 -o ${BASE}_pe2PE.bam
samtools stats ${BASE}_pe2PE.bam | grep -e "The command line was"  -e "^SN" | sed 's/# //' |  cut -f 2,3 > ${BASE}_pe2PE.stats

samtools view -u ${BASE}_pe13PE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_pe13 -o ${BASE}_pe13PE.bam
samtools stats ${BASE}_pe13PE.bam | grep -e "The command line was"  -e "^SN" | sed 's/# //' |  cut -f 2,3 > ${BASE}_pe13PE.stats

samtools index -b ${BASE}_pe2PE.bam
samtools index -b ${BASE}_pe13PE.bam


#### run Pillon with bam alignments

#! /usr/bin/env bash

java -Xmx650G -jar ~/anaconda2/envs/genomeQC/share/pilon-1.22-0/pilon-1.22.jar --genome Qcanu16_151_152s.fasta --unpaired Qcanu16_151_152s_kaustSE.bam  --unpaired Qcanu16_151_152s_miseqSE.bam  --frags Qcanu16_151_152s_pe13PE.bam  --frags Qcanu16_151_152s_pe2PE.bam --output Qcanu16_151_152s_pilon --diploid --fix all,breaks --threads 12




#####################
###   Gap Closer
#####################

######
#config.sh
######

#maximal read length
max_rd_len=301

[LIB]
#average insert size
avg_ins=179
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=4
#use only first 100 bps of each read
rd_len_cutoff=100
#in which order the reads are used while scaffolding
rank=1
# cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=50
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT80/pekaust100_1_bbnorm_T80.fq.gz
q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT80/pekaust100_2_bbnorm_T80.fq.gz

[LIB]
#average insert size
avg_ins=269
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=4
#use only first 100 bps of each read
rd_len_cutoff=100
#in which order the reads are used while scaffolding
rank=2
# cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=50
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT70/pe13_1_bbnorm_T70.fq.gz
q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT70/pe13_2_bbnorm_T70.fq.gz

[LIB]
#average insert size
avg_ins=360
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=4
#use only first 100 bps of each read
rd_len_cutoff=100
#in which order the reads are used while scaffolding
rank=3
# cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=50
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/pe2_1_ecc.fq.gz
q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/pe2_2_ecc.fq.gz

[LIB]
#average insert size
avg_ins=380
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=4
#use only first 100 bps of each read
rd_len_cutoff=200
#in which order the reads are used while scaffolding
rank=3
# cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=50
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq12_1_paired.fq.gz
q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq12_2_paired.fq.gz


[LIB]
#average insert size
avg_ins=390
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=4
#use only first 100 bps of each read
rd_len_cutoff=255
#in which order the reads are used while scaffolding
rank=3
# cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=50
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq3_1_paired.fq.gz
q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq3_2_paired.fq.gz

######
#GapCloser.sh
######


#! /usr/bin/env bash
if [ $# -ne 1 ]
then
    echo "Usage: $0 <file.fasta>"
    exit -1
fi

BASE=$(echo $1 | sed 's/.fasta$//')

/path/to/GapCloser -b config.txt -a ${1} -o ${BASE}_gapcloser.fasta -l 155 -p 31 -t 12
sed -i 's/|/_/' ${BASE}_gapcloser.fasta

summarizeAssembly.sh ${BASE}_gapcloser.fasta





#####################
###   METRICS Counts
#####################

#### the summarizeAssembly.sh script merge 2 statistics python scripts, the first one, contig_stats.py, counts the number of sequences belonging to different length intervals [500, 1.000, 5.000, 10.000, 50.000, 100.000, 500.000, 1.000.000 bp] ; the second is part of PBSuite (v15.8.24) and calculate various metrics for Scaffolds, Contigs, and GAPs. 

#### summarizeAssembly.sh 

#! /usr/bin/env bash

if [ $# -ne 1 ]
then
    echo "Usage: $0 <file.fasta>"
    exit -1
fi

BASE=$(echo $1 | sed 's/.fasta$//')


echo ${1} > ${BASE}_summary.stats

python contig_stats.py $1 | grep -e "number contigs" -e "largest contig" >> ${BASE}_summary.stats

python /Path/To/home/pganot/bin/PBSuite_15.8.24/pbsuite/utils/summarizeAssembly.py $1 >> ${BASE}_summary.stats


#### contig_stats.py
#! /usr/bin/env python

from __future__ import print_function
import sys
from readfq import readfq
from collections import Counter

usage = "Usage: {0} <.fa>".format(sys.argv[0])

if len(sys.argv) != 2:
    print(usage, file=sys.stderr)
    sys.exit(2)

seqlens = []
thresholds = [500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
l_tot = 0
#cnts_thresh = Counter()
#bp_thresh = Counter()

with open(sys.argv[1]) as f_in:
    for name, seq, quals in readfq(f_in):
        l = len(seq)
        seqlens.append(l)
        l_tot += l
        #for thresh in thresholds:
        #    if l >= thresh:
        #        cnts_thresh[thresh] += 1
        #        bp_thresh[thresh] += l

seqlens.sort(reverse=True)

def n50(ls, l_tot):
    cum = 0
    for l in ls:
        cum += l
        if cum >= l_tot // 2:
            return l

print("-" * 60)
print("number contigs:", len(seqlens))
print("total bp:", l_tot)
print("n50:", n50(seqlens, l_tot))
print("E-size:", sum((l**2)/l_tot for l in seqlens))
print("largest contig:", seqlens[0])
for thresh in thresholds:
    ls = [l for l in seqlens if l >= thresh]
    ls_t = sum(ls)
    print("-" * 60)
    print("number contigs (>={}):".format(thresh), len(ls))
    print("total bp contigs (>={}):".format(thresh), ls_t)
    print("n50 (>={}):".format(thresh), n50(ls, ls_t))
    print("E-size (>={}):".format(thresh), sum((l**2)/ls_t for l in ls))
print("-" * 60)

#print()
#for l in seqlens:
#    print("LEN", l, sep='\t')

#### readfq.py
def readfq(fp): # this is a generator function
    last = None # this is a buffer keeping the last unprocessed line
    while True: # mimic closure; is it a bad idea?
        if not last: # the first record or a record following a fastq
            for l in fp: # search for the start of the next record
                if l[0] in '>@': # fasta/q header line
                    last = l[:-1] # save this line
                    break
        if not last: break
        name, seqs, last = last[1:].partition(" ")[0], [], None
        for l in fp: # read the sequence
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+': # this is a fasta record
            yield name, ''.join(seqs), None # yield a fasta record
            if not last: break
        else: # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp: # read the quality
                seqs.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq): # have read enough quality
                    last = None
                    yield name, seq, ''.join(seqs); # yield a fastq record
                    break
            if last: # reach EOF before reading enough quality
                yield name, seq, None # yield a fasta record instead
                break

if __name__ == "__main__":
    import sys
    n, slen, qlen = 0, 0, 0
    for name, seq, qual in readfq(sys.stdin):
        n += 1
        slen += len(seq)
        qlen += qual and len(qual) or 0
    print n, '\t', slen, '\t', qlen


#####################
###   STAR alignment of RNA library
#####################

### 1/ GMAP alignment of Genome (crub_gen152) with transcriptome (V53) to produce GTF coordinate file
#! /usr/bin/env bash

#### conda activate alignment

### first build the reference for gmap
gmap_build \
  -D /media/physio/home/pganot/mapping/3genome_ban60/GMAP_gen152_V53 \
  -d crub_gen152 \
  Crub_genome152_sorted.fasta

### run GMAP to show summuary of alignment
gmap \
  -D /media/physio/home/pganot/mapping/3genome_ban60/GMAP_gen152_V53 \
  -d crub_gen152 \
  -t 24 \
  -S \
  --direction=sense_force \
  -n 1 \
  -f 2 \
  --min-intronlength=20 \
  --max-intronlength=100000 \
  v53trinityCRmiseqCDHIT.fasta  > Crub152_V53.gff

### Convert gff to gtf
gffread -E Crub152_V53.gff -T -o Crub152_V53.gtf


#######

### 2/ STAR alignment (two pass) of Genome (crub_gen152) with RNA reads ()
#! /usr/bin/env bash

# conda activate alignment

#### generate genome reference ####
#mkdir crub152
STAR  --runThreadN 36  --runMode genomeGenerate  --genomeDir /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/crub152  --genomeFastaFiles /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/Crub_genome152_sorted.fasta  --sjdbGTFfile /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/Crub152_V53.gtf  --genomeSAindexNbases 13

#### run STAR ####
STAR  --runThreadN 24    --genomeDir /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/crub152    --readFilesIn  miseq_min50_1.fq.gz  miseq_min50_2.fq.gz  --readFilesCommand zcat  --outSAMtype BAM SortedByCoordinate   --twopassMode Basic

mv  Aligned.sortedByCoord.out.bam  crub152_CRmiseq.out.bam
samtools index -b crub152_CRmiseq.out.bam


exit;