######################################### ### LIBRRIES ######################################### ### 4 types of libraries were sequenced: Paired-end on Illumina_Hiseq (PE) ; Paired-end on Illumina_miseq (miseq) ; Mate-Pair on Illumina_Hiseq (MP) ; PacBio on PacBio (PacBio) ################ ### Reads Post processing ################ ### For assemblies, MaSurCa works best with raw reads (illumina&PacBio); Canu was used with the Long raw reads longer than 1500bp. ### For scaffolding and correcting/polishing steps, PE and MP reads were post-rocessed using Trimomatic (all), bbnorm (PE), bbmerge (ovelapping PE libraries) and Nextclip/Prinseq (MPs) ######### ### Trimmomatic ######### #! /usr/bin/env bash if [ $# -ne 3 ] then echo "Usage: $0 " exit -1 fi QUALITY=33 THREADS=6 MINLEN=50 FILE1=${1} FILE2=${2} OUT=${3} /usr/bin/java -classpath /path/to/Trimmomatic-0.30/trimmomatic-0.30.jar \ org.usadellab.trimmomatic.TrimmomaticPE -threads ${THREADS} -phred33 ${FILE1} ${FILE2} \ ${OUT}_1_paired.fq.gz ${OUT}_1_unpaired.fq.gz ${OUT}_2_paired.fq.gz ${OUT}_2_unpaired.fq.gz \ ILLUMINACLIP:/path/to/Trimmomatic-0.30/adapters/clippingmiseq.fa:2:30:10 LEADING:${QUALITY} \ TRAILING:${QUALITY} SLIDINGWINDOW:4:${QUALITY} MINLEN:${MINLEN} ./fastqc_4.sh ${OUT}_1_paired.fq.gz ${OUT}_1_unpaired.fq.gz ${OUT}_2_paired.fq.gz ${OUT}_2_unpaired.fq.gz ################ ### PAIRED_END Post processing ################ ######### ### bbnorm PAIRED-END error correction ######### #! /usr/bin/env bash if [ $# -ne 3 ] then echo "Usage: $0 " exit -1 fi FILE1=$(echo ${1} | sed 's/_paired.fq.gz$//') FILE2=$(echo ${2} | sed 's/_paired.fq.gz$//') QVALUE=${3} bbnorm.sh -Xmx200g in=${1} in2=${2} extra=bbnormT40/miseq12_1_bbnorm.fq extra=bbnormT40/miseq12_2_bbnorm.fq out=${FILE1}_bbnorm_T${QVALUE}.fq out2=${FILE2}_bbnorm_T${QVALUE}.fq outt=${FILE1}_bbnormtrashT${QVALUE}.txt k=25 prefilter=t threads=16 fixspikes=t target=${QVALUE} ecc=t cec=t hist=${FILE1}_T${QVALUE}_histogram ######### ### bbmerge MISEQ sequence merge ######### #! /usr/bin/env bash if [ $# -ne 2 ] then echo "Usage: $0 " exit -1 fi OUT=$(echo ${1} | sed 's/_1.fq.gz$//') # bbmerge /path/to/bbmap/bbmerge.sh in1=${1} in2=${2} qtrim=t trimq=20 verystrict=t out=${OUT}_merged.fq outu1=${OUT}_pair1.fq outu2=${OUT}_pair2.fq # Trimmomatic /usr/bin/java -classpath /path/to/Trimmomatic-0.30/trimmomatic-0.30.jar org.usadellab.trimmomatic.TrimmomaticSE -threads 12 -phred33 ${OUT}_merged.fq ${OUT}_mergedreads.fq.gz ILLUMINACLIP:/path/to/Trimmomatic-0.30/adapters/clippingmiseq.fa:2:30:10 LEADING:33 TRAILING:33 SLIDINGWINDOW:4:33 MINLEN:250 ################ ### MATE PAIR Post processing ################ ########## #Nextclip.sh ########## #! /usr/bin/env bash if [ $# -ne 3 ] then echo "Usage: $0 " exit -1 fi OUT=$(echo ${1} | sed 's/.fq$//') ### Nextclip /path/to/nextclip-master/bin/nextclip -i ${1} -j ${2} -o ${OUT}_nextclip --min_length 45 --number_of_reads ${3} --trim_ends 0 --remove_duplicates /path/to/nextclip-master/bin/nextclip -i ${OUT}_nextclip_D_R1.fastq -j ${OUT}_nextclip_D_R2.fastq -o ${OUT}_nextclipRel --min_length 50 --number_of_reads ${3} --trim_ends 0 -x '29,14' -y '28,14' ### trim class A, B, C perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclip_A_R1.fastq -fastq2 ${OUT}_nextclip_A_R2.fastq -trim_right 6 -out_good ${OUT}_A perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_A_R1.fastq -fastq2 ${OUT}_nextclipRel_A_R2.fastq -trim_right 6 -out_good ${OUT}_Arel perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclip_B_R2.fastq -trim_right 6 -out_good ${OUT}_B perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_B_R2.fastq -trim_right 6 -out_good ${OUT}_Brel perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclip_C_R1.fastq -trim_right 6 -out_good ${OUT}_C perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_C_R1.fastq -trim_right 6 -out_good ${OUT}_Crel perl /path/to/prinseq-lite-0.20.4/prinseq-lite.pl -fastq ${OUT}_nextclipRel_D_R1.fastq -fastq2 ${OUT}_nextclipRel_D_R2.fastq -trim_right 10 -min_len 50 -out_good ${OUT}_D ### cat nextclip A, B, and C classes of paired reads cat ${OUT}_A_1.fastq ${OUT}_Arel_1.fastq ${OUT}_nextclip_B_R1.fastq ${OUT}_nextclipRel_B_R1.fastq ${OUT}_C.fastq ${OUT}_Crel.fastq > ${OUT}_ABC_R1.fastq cat ${OUT}_A_2.fastq ${OUT}_Arel_2.fastq ${OUT}_B.fastq ${OUT}_Brel.fastq ${OUT}_nextclip_C_R2.fastq ${OUT}_nextclipRel_C_R2.fastq > ${OUT}_ABC_R2.fastq ######################################### ### ASSEMBLY ######################################### ######################################### ### MasurCa Illumina_ONLY V151 ######################################### # v1.5.1 configuration file # DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields: # 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads # 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be # innies, i.e. --->.<---, and JUMP are assumed to be outties # <---.--->. If there are any jump libraries that are innies, such as # longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads # are optional for PE libraries and mandatory for JUMP libraries. Any # OTHER sequence data (454, Sanger, Ion torrent, etc) must be first # converted into Celera Assembler compatible .frg files (see # http://wgs-assembler.sourceforge.com) # MASURCA: /Path/To/build/MaSuRCA-3.2.2/bin/masurca DATA PE= p4 190 40 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_2.fq.gz PE= p1 400 100 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_2.fq.gz PE= p2 280 50 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_2.fq.gz PE= p3 280 50 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_2.fq.gz PE= m1 380 90 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_2.fq.gz PE= m2 380 90 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_2.fq.gz PE= m3 400 100 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_2.fq.gz JUMP= j5 4400 650 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_2.fq.gz JUMP= j4 3600 600 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_2.fq.gz JUMP= j3 6000 900 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_2.fq.gz JUMP= j2 8500 1200 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_2.fq.gz JUMP= j1 11000 2000 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_2.fq.gz #pacbio reads must be in a single fasta file! make sure you provide absolute path #PACBIO=/Path/To/home/pganot/data/runs/genome_CR130114/pacbio_fastq/pacbio/1_all_fastq/pacbio_subreads_all.fasta #OTHER=/FULL_PATH/file.frg #PE reads coverage estimated > 186X #JUMP reads coverage estimated 80X #PACBIO coverage estimated 0X END PARAMETERS #this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content GRAPH_KMER_SIZE = auto #set this to 1 for all Illumina-only assemblies #set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs #otherwise keep at 0 USE_LINKING_MATES = 0 #this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms LIMIT_JUMP_COVERAGE = 300 #these are the additional parameters to Celera Assembler. do not worry about performance, number or processors or batch sizes -- these are computed automatically. #set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms. CA_PARAMETERS = cgwErrorRate=0.15 #minimum count k-mers used in error correction 1 means all k-mers are used. one can increase to 2 if Illumina coverage >100 KMER_COUNT_THRESHOLD = 2 #auto-detected number of cpus to use NUM_THREADS = 36 #this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage JF_SIZE = 100000000000 #set this to 1 to use SOAPdenovo contigging/scaffolding module. Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes SOAP_ASSEMBLY=0 END ######################################### ### CANU canu16 ######################################### (base) [pganot@localhost assembly]$ cat /Path/To/home/pganot/assembly/canu/canu1_6.sh #! /usr/bin/env bash #####this uses the stabe release canu1-6 export PATH=/Path/To/build/canu-1.6/Linux-amd64/bin:$PATH canu -p pbraw -d v1.1 genomeSize=550m maxMemory=650 maxThreads=36 -pacbio-raw pacbio_subreads_raw.fastq ######################################### ### MasurCa_Hybrid Illumina_PacBIO V152 ######################################### # v1.5.2 configuration file # same config as v1.5.1 configuration file with uncommented line for PACBIO subreads # DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields: # 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads # 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be # innies, i.e. --->.<---, and JUMP are assumed to be outties # <---.--->. If there are any jump libraries that are innies, such as # longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads # are optional for PE libraries and mandatory for JUMP libraries. Any # OTHER sequence data (454, Sanger, Ion torrent, etc) must be first # converted into Celera Assembler compatible .frg files (see # http://wgs-assembler.sourceforge.com) # MASURCA: /Path/To/build/MaSuRCA-3.2.2/bin/masurca DATA PE= p4 190 40 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaust_PE1_2.fq.gz PE= p1 400 100 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE2_2.fq.gz PE= p2 280 50 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE1_2.fq.gz PE= p3 280 50 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_PE3_2.fq.gz PE= m1 380 90 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq1_2.fq.gz PE= m2 380 90 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq2_2.fq.gz PE= m3 400 100 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_miseq3_2.fq.gz *JUMP= j5 4400 650 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_EMBL_MP_2.fq.gz JUMP= j4 3600 600 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP4_2.fq.gz JUMP= j3 6000 900 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP3_2.fq.gz JUMP= j2 8500 1200 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP2_2.fq.gz JUMP= j1 11000 2000 /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_1.fq.gz /Path/To/home/pganot/data/runs/genome_CR130114/Crub130114_kaustMP1_2.fq.gz #pacbio reads must be in a single fasta file! make sure you provide absolute path PACBIO=/Path/To/home/pganot/data/runs/genome_CR130114/pacbio_fastq/pacbio/1_all_fastq/pacbio_subreads_all.fasta #OTHER=/FULL_PATH/file.frg #PE reads coverage estimated > 186X #JUMP reads coverage estimated 80X #PACBIO coverage estimated 0X END PARAMETERS #this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content GRAPH_KMER_SIZE = auto #set this to 1 for all Illumina-only assemblies #set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs #otherwise keep at 0 USE_LINKING_MATES = 0 #this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms LIMIT_JUMP_COVERAGE = 300 #these are the additional parameters to Celera Assembler. do not worry about performance, number or processors or batch sizes -- these are computed automatically. #set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms. CA_PARAMETERS = cgwErrorRate=0.15 #minimum count k-mers used in error correction 1 means all k-mers are used. one can increase to 2 if Illumina coverage >100 KMER_COUNT_THRESHOLD = 2 #auto-detected number of cpus to use NUM_THREADS = 36 #this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage JF_SIZE = 100000000000 #set this to 1 to use SOAPdenovo contigging/scaffolding module. Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes SOAP_ASSEMBLY=0 END ######################################### ### Assembly post-processing ##################################### #### IMPORTANT NOTE: after every assembly step of the pipeline (MaSurCa assembly V151,152, LRnascaff, Rascaf, sspace), scaffolds containing gaps with more than 10,000 Ns were split into 2 contigs without Ns with the following homemade scripts: ############################## ######## split_longN_fasta.sh #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi BASE=$(echo $1 | sed 's/.fasta$//') ## reformat fasta to have 1 line sequences /Path/To/build/bbmap/reformat.sh in=${1} out=${BASE}_1.fasta fastawrap=0 ## replace stretch of Ns longer than 10000 by >tmp perl -p -e 's/N{10000,}/\n>tmp\n/g' ${BASE}_1.fasta > ${BASE}_2.fasta ## remove any empty sequences, e.g. Ns as the begining or the end of the sequence. ### awk cmd explanation: With RS, we declare that the records are separated by >. If the record contains a second line $2 (i.e. is not an empty fasta record), print the record (add a > in front) awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' ${BASE}_2.fasta > ${BASE}_3.fasta ## remove sequence shorter than 500 bp /Path/To/build/kentLib/faFilter -minSize=500 ${BASE}_3.fasta ${BASE}_4.fasta ## rename scaffold scaff_header.sh ${BASE}_4.fasta > ${BASE}_max10000N.fasta ## clean & stats rm ${BASE}_1.fasta rm ${BASE}_2.fasta rm ${BASE}_3.fasta rm ${BASE}_4.fasta summarizeAssembly.sh ${BASE}_max10000N.fasta # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page exit ; ###### ######################################### ### SCAFFOLDING ######################################### ######### ### Lrna_Scaffolder ######### #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi CONT=$(echo $1 | sed 's/_max10000N.fasta$//') ####### 1/ Blat the "ban60_mRNA_softmask_longestIsoform.fasta" transcripts against the contigs /Path/To/build/kentLib/blat/blat ${1} /Path/To/home/pganot/repeats/redUnix64/2banv60_longIso/ban60_mRNA_softmask_longestIsoform.fasta blat_${CONT}_vs_banv60.psl -noHead -qMask=lower ###### 2/ run L_RNA_scaffolder /Path/To/build/L_RNA_scaffolder/L_RNA_scaffolder.sh -d /Path/To/build/L_RNA_scaffolder/ -i blat_${CONT}_vs_banv60.psl -j ${1} mv L_RNA_scaffolder.fasta ${CONT}_lrna.fasta summarizeAssembly.sh ${CONT}_lrna.fasta # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page split_longN_fasta.sh ${CONT}_lrna.fasta # gaps with more than 10,000 Ns were split into 2 contigs without Ns (see script aforementioned) ../clean_scaffolder.sh ######### ### RASCAF ######### #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi export PATH=/Path/To/build/Rascaf:$PATH BASE=$(echo ${1} | sed 's/_max10000N.fasta$//') ### create genome index bwa index -p ${BASE} -a is ${1} ### run bwa mem with default parameter bwa mem -t 12 ${BASE} rna_miseq_uniq_1.fastq rna_miseq_uniq_2.fastq > ${BASE}_miseq.sam samtools view -u ${BASE}_miseq.sam | samtools sort -m 100G -@ 4 - ${BASE}_miseq samtools stats ${BASE}_miseq.bam | grep -e "The command line was" -e "^SN" | sed 's/# //' | cut -f 2,3 > ${BASE}_miseq.stats rm ${BASE}.sam bwa mem -t 12 ${BASE} rna_pe1_uniq_1.fastq rna_pe1_uniq_2.fastq > ${BASE}_tot.sam samtools view -u ${BASE}_tot.sam | samtools sort -m 100G -@ 4 - ${BASE}_tot samtools stats ${BASE}_tot.bam | grep -e "The command line was" -e "^SN" | sed 's/# //' | cut -f 2,3 > ${BASE}_tot.stats rm ${BASE}_tot.sam ### run rascaf rascaf -b ${BASE}_miseq.bam -f ${1} -breakN 26 -o ${BASE}_miseq -ml 100 rascaf -b ${BASE}_tot.bam -f ${1} -breakN 26 -o ${BASE}_tot -ml 100 ### run rascaf-join rascaf-join -r ${BASE}_miseq.out -r ${BASE}_tot.out -o ${BASE}_rascaf ### stats mv ${BASE}_rascaf.fa ${BASE}_rascaf.fasta summarizeAssembly.sh ${BASE}_rascaf.fasta split_longN_fasta.sh ${BASE}_rascaf.fasta # gaps with more than 10,000 Ns were split into 2 contigs without Ns (see script aforementioned) summarizeAssembly.sh ${BASE}_rascaf_max10000N.fasta # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page rm *.bam rm *.out exit ; ######### ### SSPACE ######### #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi BASE=$(echo $1 | sed 's/_max10000N.fasta$//') export PATH=/Path/To/build/SSPACE-STANDARD-3.0_linux-x86_64/:$PATH ## Run #Usage: /Path/To/build/SSPACE-STANDARD-3.0_linux-x86_64/SSPACE_Standard_v3.0.pl [SSPACE_Standard_v3.0_linux] # #============ General Parameters ============ #-l Library file containing two mate pate files with insert size, error and either mate pair or paired end indication. #-s Fasta file containing contig sequences used for extension. Inserted pairs are mapped to extended and non-extended contigs (REQUIRED) #-x Indicate whether to extend the contigs of -s using paired reads in -l. (-x 1=extension, -x 0=no extension, default -x 0) #============ Extension Parameters ============ #-m Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 32) #-o Minimum number of reads needed to call a base during an extension (default -o 20) #============ Scaffolding Parameters ============ #-z Minimum contig length used for scaffolding. Filters out contigs that are below -z (default -z 0 (no filtering), optional). #-k Minimum number of links (read pairs) to compute scaffold (default -k 5, optional) #-a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a 0.7, optional) #-n Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n 15, optional) #============ Bowtie Parameters ============ #-g Maximum number of allowed gaps during mapping with Bowtie. Corresponds to the -v option in Bowtie. *higher number of allowed gaps can lead to least accurate scaffolding* (default -v 0, optional) #============ Additional Parameters ============ #-T Specify the number of threads to run SSPACE, used both for reading the input readfiles and mapping the reads against the contigs. For reading in the files, multiple files are read-in simultaneously. With read-mapping, the readmapper is called multiple times with 1 million reads per calls (default -T 1, optional) #-S Skip the processing of the reads. Meaning that SSPACE was already run, but user now wants to use different extension/scaffold parameters. #-b Base name for your output files (optional) #-v Runs the scaffolding process in verbose mode (-v 1=yes, -v 0=no, default -v 0, optional) #-p Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p 0, optional) # #from the tutorial #perl (path_to_SSPACE)/SSPACE_Standard_v3.0.pl -l libraries.txt -s contigs_abyss.fa -k 5 -a 0.7 -x 0 -b ecoli_scaffolds_no_extension #perl (path_to_SSPACE)/SSPACE_Standard_v3.0.pl -l libraries.txt -s contigs_abyss.fa -k 5 -a 0.7 -x 1 -m 30 -o 20 -b ecoli_scaffolds_extension SSPACE_Standard_v3.0.pl -l libraries.txt -s ${1} -z 100 -T 12 -k 5 -a 0.7 -x 0 -b ${BASE}_default #libraries.txt correspond to the ABC-clipped MP reads (see Nextclip above) cat ${BASE}_default/${BASE}_default.summaryfile.txt | sed -e 's/target:/target/' -e 's/unsatisfied:/\n unsatisfied:/' | tr '=' ':' > ${BASE}_default.summaryfile.txt.sed mv ${BASE}_default/${BASE}_default.final.scaffolds.fasta ${BASE}_sspace.fasta split_longN_fasta.sh ${BASE}_sspace.fasta # gaps with more than 10,000 Ns were split into 2 contigs without Ns (see script aforementioned) summarizeAssembly.sh ${BASE}_sspace_max10000N.fasta # this summarizeAssembly.sh script is the metrics scrpit given at the end of the page ######################################### ### METASSEMBLER ######################################### ## Metassemble configuration file 7 [global] bowtie2_threads=12 bowtie2_read1=/Path/To/home/pganot/data/runs/genome_CR130114/data_genome/5_cleanMP/mp3_ABCclipped_1_paired.fq.gz bowtie2_read2=/Path/To/home/pganot/data/runs/genome_CR130114/data_genome/5_cleanMP/mp3_ABCclipped_2_paired.fq.gz bowtie2_maxins=6000 bowtie2_minins=3000 #estimated median size for MP3 is 4400 mateAn_A=3600 mateAn_B=5200 #nucmer_l = "Minimum length of an maximal exact match" ; nucmer_c = "Minimum cluster length" nucmer_l=100 nucmer_c=500 #When linking two primary scaffolds to the same secondary scaffold, only use alignments with length>=l (Default: 60) asseMerge_l=100 [1] fasta=/Path/To/home/pganot/assembly/mzrk_canu/1Lrna_rascaf_sspace_gapcloser/mzrk152_max10000N_lrna_rascaf_max10000N_sspace.fasta ID=152s [2] fasta=/Path/To/home/pganot/data/assembly/contigs/masurca/mzrk151/mzrk151_max10000N.fasta ID=151 [3] fasta=/Path/To/home/pganot/data/assembly/contigs/canu/canu1_6/pbraw.contigs.fasta ID=canu16 ## metass.sh #! /usr/bin/env bash export PATH=/Path/To/build/MUMmer3.22:$PATH export PATH=/Path/To/build/Metassembler/bin:$PATH metassemble --conf metassemble.config --outd ./MergeMetassemble ######################################### ### CORRECTION_POLISH ######################################### ##################### ### PILLON ##################### ####Create bam alignements with trimmed, merged (when possible e.g. ovelappin miseq), and corrected reads #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi BASE=$(echo ${1} | sed 's/.fasta$//') ## create genome index bwa index -p ${BASE} -a is ${1} ## run bwa mem with default parameter bwa mem -t 8 ${BASE} miseqmerge.fq.gz > ${BASE}_miseqSE.sam bwa mem -t 8 ${BASE} kaustPEmerged_min130.fq.gz > ${BASE}_kaustSE.sam samtools view -u ${BASE}_miseqSE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_miseq -o ${BASE}_miseqSE.bam samtools stats ${BASE}_miseqSE.bam | grep -e "The command line was" -e "^SN" | sed 's/# //' | cut -f 2,3 > ${BASE}_miseqSE.stats samtools view -u ${BASE}_kaustSE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_kaust -o ${BASE}_kaustSE.bam samtools stats ${BASE}_kaustSE.bam | grep -e "The command line was" -e "^SN" | sed 's/# //' | cut -f 2,3 > ${BASE}_kaustSE.stats samtools index -b ${BASE}_miseqSE.bam samtools index -b ${BASE}_kaustSE.bam bwa mem -t 8 ${BASE} pe2_1_ecc.fq.gz pe2_2_ecc.fq.gz > ${BASE}_pe2PE.sam bwa mem -t 8 ${BASE} pe13_1_bbnorm_T70.fq.gz pe13_2_bbnorm_T70.fq.gz > ${BASE}_pe13PE.sam samtools view -u ${BASE}_pe2PE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_pe2 -o ${BASE}_pe2PE.bam samtools stats ${BASE}_pe2PE.bam | grep -e "The command line was" -e "^SN" | sed 's/# //' | cut -f 2,3 > ${BASE}_pe2PE.stats samtools view -u ${BASE}_pe13PE.sam | samtools sort -m 100G -@ 8 - -T ${BASE}_pe13 -o ${BASE}_pe13PE.bam samtools stats ${BASE}_pe13PE.bam | grep -e "The command line was" -e "^SN" | sed 's/# //' | cut -f 2,3 > ${BASE}_pe13PE.stats samtools index -b ${BASE}_pe2PE.bam samtools index -b ${BASE}_pe13PE.bam #### run Pillon with bam alignments #! /usr/bin/env bash java -Xmx650G -jar ~/anaconda2/envs/genomeQC/share/pilon-1.22-0/pilon-1.22.jar --genome Qcanu16_151_152s.fasta --unpaired Qcanu16_151_152s_kaustSE.bam --unpaired Qcanu16_151_152s_miseqSE.bam --frags Qcanu16_151_152s_pe13PE.bam --frags Qcanu16_151_152s_pe2PE.bam --output Qcanu16_151_152s_pilon --diploid --fix all,breaks --threads 12 ##################### ### Gap Closer ##################### ###### #config.sh ###### #maximal read length max_rd_len=301 [LIB] #average insert size avg_ins=179 #if sequence needs to be reversed reverse_seq=0 #in which part(s) the reads are used asm_flags=4 #use only first 100 bps of each read rd_len_cutoff=100 #in which order the reads are used while scaffolding rank=1 # cutoff of pair number for a reliable connection (at least 3 for short insert size) pair_num_cutoff=3 #minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) map_len=50 #a pair of fastq file, read 1 file should always be followed by read 2 file q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT80/pekaust100_1_bbnorm_T80.fq.gz q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT80/pekaust100_2_bbnorm_T80.fq.gz [LIB] #average insert size avg_ins=269 #if sequence needs to be reversed reverse_seq=0 #in which part(s) the reads are used asm_flags=4 #use only first 100 bps of each read rd_len_cutoff=100 #in which order the reads are used while scaffolding rank=2 # cutoff of pair number for a reliable connection (at least 3 for short insert size) pair_num_cutoff=3 #minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) map_len=50 #a pair of fastq file, read 1 file should always be followed by read 2 file q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT70/pe13_1_bbnorm_T70.fq.gz q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/2_errorCorrection/bbnormT70/pe13_2_bbnorm_T70.fq.gz [LIB] #average insert size avg_ins=360 #if sequence needs to be reversed reverse_seq=0 #in which part(s) the reads are used asm_flags=4 #use only first 100 bps of each read rd_len_cutoff=100 #in which order the reads are used while scaffolding rank=3 # cutoff of pair number for a reliable connection (at least 3 for short insert size) pair_num_cutoff=3 #minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) map_len=50 #a pair of fastq file, read 1 file should always be followed by read 2 file q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/pe2_1_ecc.fq.gz q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/pe2_2_ecc.fq.gz [LIB] #average insert size avg_ins=380 #if sequence needs to be reversed reverse_seq=0 #in which part(s) the reads are used asm_flags=4 #use only first 100 bps of each read rd_len_cutoff=200 #in which order the reads are used while scaffolding rank=3 # cutoff of pair number for a reliable connection (at least 3 for short insert size) pair_num_cutoff=3 #minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) map_len=50 #a pair of fastq file, read 1 file should always be followed by read 2 file q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq12_1_paired.fq.gz q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq12_2_paired.fq.gz [LIB] #average insert size avg_ins=390 #if sequence needs to be reversed reverse_seq=0 #in which part(s) the reads are used asm_flags=4 #use only first 100 bps of each read rd_len_cutoff=255 #in which order the reads are used while scaffolding rank=3 # cutoff of pair number for a reliable connection (at least 3 for short insert size) pair_num_cutoff=3 #minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) map_len=50 #a pair of fastq file, read 1 file should always be followed by read 2 file q1=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq3_1_paired.fq.gz q2=/path/to/home/pganot/data/runs/genome_CR130114/data_genome/3_eccOnly/miseq3_2_paired.fq.gz ###### #GapCloser.sh ###### #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi BASE=$(echo $1 | sed 's/.fasta$//') /path/to/GapCloser -b config.txt -a ${1} -o ${BASE}_gapcloser.fasta -l 155 -p 31 -t 12 sed -i 's/|/_/' ${BASE}_gapcloser.fasta summarizeAssembly.sh ${BASE}_gapcloser.fasta ##################### ### METRICS Counts ##################### #### the summarizeAssembly.sh script merge 2 statistics python scripts, the first one, contig_stats.py, counts the number of sequences belonging to different length intervals [500, 1.000, 5.000, 10.000, 50.000, 100.000, 500.000, 1.000.000 bp] ; the second is part of PBSuite (v15.8.24) and calculate various metrics for Scaffolds, Contigs, and GAPs. #### summarizeAssembly.sh #! /usr/bin/env bash if [ $# -ne 1 ] then echo "Usage: $0 " exit -1 fi BASE=$(echo $1 | sed 's/.fasta$//') echo ${1} > ${BASE}_summary.stats python contig_stats.py $1 | grep -e "number contigs" -e "largest contig" >> ${BASE}_summary.stats python /Path/To/home/pganot/bin/PBSuite_15.8.24/pbsuite/utils/summarizeAssembly.py $1 >> ${BASE}_summary.stats #### contig_stats.py #! /usr/bin/env python from __future__ import print_function import sys from readfq import readfq from collections import Counter usage = "Usage: {0} <.fa>".format(sys.argv[0]) if len(sys.argv) != 2: print(usage, file=sys.stderr) sys.exit(2) seqlens = [] thresholds = [500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000] l_tot = 0 #cnts_thresh = Counter() #bp_thresh = Counter() with open(sys.argv[1]) as f_in: for name, seq, quals in readfq(f_in): l = len(seq) seqlens.append(l) l_tot += l #for thresh in thresholds: # if l >= thresh: # cnts_thresh[thresh] += 1 # bp_thresh[thresh] += l seqlens.sort(reverse=True) def n50(ls, l_tot): cum = 0 for l in ls: cum += l if cum >= l_tot // 2: return l print("-" * 60) print("number contigs:", len(seqlens)) print("total bp:", l_tot) print("n50:", n50(seqlens, l_tot)) print("E-size:", sum((l**2)/l_tot for l in seqlens)) print("largest contig:", seqlens[0]) for thresh in thresholds: ls = [l for l in seqlens if l >= thresh] ls_t = sum(ls) print("-" * 60) print("number contigs (>={}):".format(thresh), len(ls)) print("total bp contigs (>={}):".format(thresh), ls_t) print("n50 (>={}):".format(thresh), n50(ls, ls_t)) print("E-size (>={}):".format(thresh), sum((l**2)/ls_t for l in ls)) print("-" * 60) #print() #for l in seqlens: # print("LEN", l, sep='\t') #### readfq.py def readfq(fp): # this is a generator function last = None # this is a buffer keeping the last unprocessed line while True: # mimic closure; is it a bad idea? if not last: # the first record or a record following a fastq for l in fp: # search for the start of the next record if l[0] in '>@': # fasta/q header line last = l[:-1] # save this line break if not last: break name, seqs, last = last[1:].partition(" ")[0], [], None for l in fp: # read the sequence if l[0] in '@+>': last = l[:-1] break seqs.append(l[:-1]) if not last or last[0] != '+': # this is a fasta record yield name, ''.join(seqs), None # yield a fasta record if not last: break else: # this is a fastq record seq, leng, seqs = ''.join(seqs), 0, [] for l in fp: # read the quality seqs.append(l[:-1]) leng += len(l) - 1 if leng >= len(seq): # have read enough quality last = None yield name, seq, ''.join(seqs); # yield a fastq record break if last: # reach EOF before reading enough quality yield name, seq, None # yield a fasta record instead break if __name__ == "__main__": import sys n, slen, qlen = 0, 0, 0 for name, seq, qual in readfq(sys.stdin): n += 1 slen += len(seq) qlen += qual and len(qual) or 0 print n, '\t', slen, '\t', qlen ##################### ### STAR alignment of RNA library ##################### ### 1/ GMAP alignment of Genome (crub_gen152) with transcriptome (V53) to produce GTF coordinate file #! /usr/bin/env bash #### conda activate alignment ### first build the reference for gmap gmap_build \ -D /media/physio/home/pganot/mapping/3genome_ban60/GMAP_gen152_V53 \ -d crub_gen152 \ Crub_genome152_sorted.fasta ### run GMAP to show summuary of alignment gmap \ -D /media/physio/home/pganot/mapping/3genome_ban60/GMAP_gen152_V53 \ -d crub_gen152 \ -t 24 \ -S \ --direction=sense_force \ -n 1 \ -f 2 \ --min-intronlength=20 \ --max-intronlength=100000 \ v53trinityCRmiseqCDHIT.fasta > Crub152_V53.gff ### Convert gff to gtf gffread -E Crub152_V53.gff -T -o Crub152_V53.gtf ####### ### 2/ STAR alignment (two pass) of Genome (crub_gen152) with RNA reads () #! /usr/bin/env bash # conda activate alignment #### generate genome reference #### #mkdir crub152 STAR --runThreadN 36 --runMode genomeGenerate --genomeDir /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/crub152 --genomeFastaFiles /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/Crub_genome152_sorted.fasta --sjdbGTFfile /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/Crub152_V53.gtf --genomeSAindexNbases 13 #### run STAR #### STAR --runThreadN 24 --genomeDir /media/physio/home/pganot/mapping/1STAR_genome_CRmiseq/crub152 --readFilesIn miseq_min50_1.fq.gz miseq_min50_2.fq.gz --readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --twopassMode Basic mv Aligned.sortedByCoord.out.bam crub152_CRmiseq.out.bam samtools index -b crub152_CRmiseq.out.bam exit;