#!/bin/bash
#SBATCH --job-name=Beauveria_complete_pipeline
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=128
#SBATCH --output=logs/pipeline_%j.out
#SBATCH --error=logs/pipeline_%j.err
#SBATCH --time=7-00:00:00

# ------------------------------------------------------------
# Beauveria bassiana multi‑genome analysis pipeline
# Combines: download, preprocessing, BUSCO, BRAKER3,
# funannotate, antiSMASH, BiG‑SCAPE, phylogenomics
# ------------------------------------------------------------

set -euo pipefail
trap 'echo "Error at line $LINENO" >&2' ERR

# ----------------------------------------------------------------------
# 0. Configuration & directories
# ----------------------------------------------------------------------
WORKDIR=$(pwd)
mkdir -p logs raw_genome reference protein_dbs antismash_input antismash_output bigscape_input bigscape_results orthofinder_input orthofinder_output
THREADS=$SLURM_CPUS_PER_TASK

# Conda initialisation
source $HOME/miniconda3/etc/profile.d/conda.sh

# ----------------------------------------------------------------------
# 1. Download genomes and reference proteins (if not already present)
# ----------------------------------------------------------------------
echo "[1] Downloading Beauveria genomes and reference proteins"
conda activate ncbi_datasets

if [ ! -d raw_genome ] || [ -z "$(ls -A raw_genome/*.fna 2>/dev/null)" ]; then
    datasets download genome taxon "Beauveria" --include genome --filename beauveria_genomic.zip
    unzip -q beauveria_genomic.zip -d temp_download
    find temp_download -name "*.fna" -exec mv {} raw_genome/ \;
    rm -rf temp_download beauveria_genomic.zip
fi

if [ ! -f reference/protein.faa ]; then
    mkdir -p reference
    datasets download genome accession GCF_000280675.1 --include protein --filename ref_prot.zip
    unzip -q ref_prot.zip -d ref_temp
    find ref_temp -name "*.faa" -exec cp {} reference/protein.faa \;
    rm -rf ref_temp ref_prot.zip
fi
conda deactivate

# ----------------------------------------------------------------------
# 2. Preprocess genomes (clean, sort, mask) with funannotate
# ----------------------------------------------------------------------
echo "[2] Preprocessing genomes with funannotate"
conda activate funannotate_env

cd raw_genome
for f in *.fna; do
    base=${f%.fna}
    if [ ! -f "${base}.clean.sort.mask.fna" ]; then
        funannotate clean -i "$f" -o "${base}.clean.fna"
        funannotate sort -i "${base}.clean.fna" -o "${base}.clean.sort.fna"
        funannotate mask -i "${base}.clean.sort.fna" -o "${base}.clean.sort.mask.fna"
    fi
done
cd ..
conda deactivate

# ----------------------------------------------------------------------
# 3. BUSCO quality assessment (optional, but informative)
# ----------------------------------------------------------------------
echo "[3] Running BUSCO quality assessment"
conda activate busco_env

cd raw_genome
for genome in *.clean.sort.mask.fna; do
    base=${genome%.clean.sort.mask.fna}
    if [ ! -d "${base}_busco" ]; then
        busco -i "$genome" -o "${base}_busco" -l hypocreales_odb10 -m genome --cpu $THREADS --download_path ../busco_downloads
    fi
done
cd ..
conda deactivate

# ----------------------------------------------------------------------
# 4. Gene prediction with BRAKER3 using reference proteins
# ----------------------------------------------------------------------
echo "[4] Running BRAKER3 gene prediction"
conda activate braker3_env

cd raw_genome
REF_PROT="../reference/protein.faa"
for genome in *.clean.sort.mask.fna; do
    base=${genome%.clean.sort.mask.fna}
    if [ ! -d "${base}_braker" ]; then
        braker.py \
            --genome="$genome" \
            --prot_seq="$REF_PROT" \
            --workingdir="${base}_braker" \
            --threads=$THREADS \
            --gff3 \
            --fungus \
            --softmasking
    fi
done
cd ..
conda deactivate

# ----------------------------------------------------------------------
# 5. Functional annotation with funannotate (using BRAKER GFF3)
# ----------------------------------------------------------------------
echo "[5] Running funannotate annotation"
conda activate funannotate_env

cd raw_genome
for genome in *.clean.sort.mask.fna; do
    base=${genome%.clean.sort.mask.fna}
    gff="${base}_braker/braker.gff3"
    if [ -f "$gff" ] && [ ! -d "$base" ]; then
        funannotate annotate \
            --fasta "$genome" \
            --gff "$gff" \
            -o "$base" \
            --species "Beauveria bassiana" \
            --cpus $THREADS
    fi
done
cd ..
conda deactivate

# ----------------------------------------------------------------------
# 6. antiSMASH detection of secondary metabolite clusters
# ----------------------------------------------------------------------
echo "[6] Running antiSMASH"
conda activate antismash_env

# Collect all GenBank files from funannotate outputs
find raw_genome -path "*/annotate_results/*.gbk" -exec cp {} antismash_input/ \;

cd antismash_input
for gbk in *.gbk; do
    base=${gbk%.gbk}
    if [ ! -d "../antismash_output/$base" ]; then
        antismash "$gbk" \
            --taxon fungi \
            --genefinding-tool none \
            --output-dir ../antismash_output/$base \
            --cpus $THREADS
    fi
done
cd ..
conda deactivate

# ----------------------------------------------------------------------
# 7. BiG‑SCAPE clustering of biosynthetic gene clusters
# ----------------------------------------------------------------------
echo "[7] Running BiG‑SCAPE"
conda activate bigscape_env

# Copy all region GBK files to input directory
find antismash_output -name "*region*.gbk" -exec cp {} bigscape_input/ \;

if [ ! -d bigscape_results ] || [ -z "$(ls -A bigscape_results)" ]; then
    bigscape.py \
        --inputdir bigscape_input \
        --outputdir bigscape_results \
        --threads $THREADS \
        --mibig \
        --mix \
        --keep_singletons
fi

# Create final bigscape_output with cleaned results
mkdir -p bigscape_output
cp -r bigscape_results/networkfiles/*_hybrids bigscape_output/
find bigscape_output -type d -name "GCF_trees" -exec rm -rf {} + 2>/dev/null || true
conda deactivate

# ----------------------------------------------------------------------
# 8. BUSCO for phylogenomics (nucleotide mode, Metaeuk)
# ----------------------------------------------------------------------
echo "[8] Running BUSCO for phylogenomics"
conda activate busco_env

mkdir -p busco_nt_output
cd raw_genome

for genome in *.fna; do
    base=${genome%.fna}
    if [ ! -d "../busco_nt_output/${base}_busco_nt" ]; then
        busco \
            -i "$genome" \
            -o "../busco_nt_output/${base}_busco_nt" \
            -l hypocreales_odb10 \
            -m genome \
            --nt \
            --metaeuk \
            --cpu $THREADS
    fi
done
cd ..
conda deactivate

# ----------------------------------------------------------------------
# 9. Generate supermatrix from BUSCO single-copy orthologs
# ----------------------------------------------------------------------
echo "[9] Generating supermatrix"
conda activate busco_env

# The custom script BUSCO_phylogenomics.py is assumed to be in PATH
if command -v BUSCO_phylogenomics.py >/dev/null 2>&1; then
    BUSCO_phylogenomics.py \
        -i raw_genome \
        -o Bb_Phylogeny \
        -t $THREADS \
        -l hypocreales_odb10 \
        -p 80 \
        --supermatrix \
        --nt
else
    echo "ERROR: BUSCO_phylogenomics.py not found. Please ensure it is installed and in PATH."
    exit 1
fi
conda deactivate

# ----------------------------------------------------------------------
# 10. IQ‑TREE phylogenetic inference
# ----------------------------------------------------------------------
echo "[10] Running IQ‑TREE"
conda activate iqtree_env  # Assumes iqtree environment exists

cd Bb_Phylogeny
if [ -f SUPERMATRIX.phylip ] && [ -f SUPERMATRIX.partitions.nex ]; then
    iqtree2 \
        -s SUPERMATRIX.phylip \
        -p SUPERMATRIX.partitions.nex \
        -m MFP+MERGE \
        -B 1000 \
        --alrt 1000 \
        -T AUTO
else
    echo "ERROR: Supermatrix files not found. Check BUSCO supermatrix step."
    exit 1
fi
conda deactivate

echo "Pipeline completed at $(date)"