#!/bin/bash
set -euo pipefail

#########################################################################################################
# This script sets up the benchmark datasets from ONT's genome in a bottle dataset. To run it make sure #
# that samtools is in the $PATH. It downloads the reference file, the kmer table and the pod5 files.    #
# The pod5 files are base-called and mapped. Afterwards, the data is split into short, medium and       #
# long reads. The three subsets are then subset again by different number of reads.                     # 
#########################################################################################################

# Directory containing this script; every other path is derived from it.
BASEDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Root directory of the GIAB benchmark data.
GIABDIR="${BASEDIR}/dna_giab"

# Helper scripts, raw pod5 files and the generated read subsets.
SCRIPTDIR="${GIABDIR}/scripts"
POD5DIR="${GIABDIR}/pod5"
SUBSETDIR="${GIABDIR}/subsets"

# Uncompressed reference genome.
REFFILE="${GIABDIR}/ref.fa"

# Load dorado basecaller variables (model & cuda settings)
# shellcheck source=/dev/null
source "${GIABDIR}/.env"
# Fail fast: MODEL and DORADO_CUDA are only consumed by the base-calling step,
# which runs after the lengthy downloads. Abort now if either one is unset
# instead of letting `set -u` trip much later.
: "${MODEL?MODEL must be set (expected in ${GIABDIR}/.env)}"
: "${DORADO_CUDA?DORADO_CUDA must be set (expected in ${GIABDIR}/.env)}"
# f5c path
F5C="${BASEDIR}/../bin/f5c"


# Announce this stage like every other stage of the script does.
echo "Preparing the Python environment..."

# The virtual environment lives one level above this script's directory.
VENVDIR="${BASEDIR}/../venv"
python -m venv "$VENVDIR"
# shellcheck source=/dev/null
source "${VENVDIR}/bin/activate"
# Install the Python dependencies into the environment activated above.
pip install -r "${BASEDIR}/../requirements.txt"


echo "Downloading the reference genome..."

# Fetch the gzip-compressed GENCODE GRCh38 primary assembly next to REFFILE.
wget -O "${REFFILE}.gz" \
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/GRCh38.primary_assembly.genome.fa.gz"
# -f: overwrite an existing REFFILE. Without it gzip refuses to replace the
# file when run non-interactively and returns a non-zero status, which aborts
# a re-run of this script under `set -e`. wget -O already overwrites the
# archive, so forcing the decompression keeps both steps consistent.
gzip -df "${REFFILE}.gz"


echo "Downloading the kmer-table..."

# 9-mer level table for the dna_r10.4.1_e8.2_400bps chemistry, taken from
# ONT's kmer_models repository and stored as levels.txt in GIABDIR.
kmer_table_url="https://raw.githubusercontent.com/nanoporetech/kmer_models/refs/heads/master/dna_r10.4.1_e8.2_400bps/9mer_levels_v1.txt"
wget --output-document="${GIABDIR}/levels.txt" "$kmer_table_url"


echo "Downloading pod5 files..."

# `aws s3 cp` treats a destination that is not an existing directory as a
# file name, so without this every download would be written to (and
# overwrite) a single file called "pod5". Create the directory up front.
mkdir -p "$POD5DIR"

# The flowcell data is split into 20 pod5 files, numbered 0 through 19.
for i in {0..19}; do
    filename="PAW79146_c6420e1f_4af91fd6_${i}.pod5"
    echo "Downloading file '${filename}'"
    # The trailing slash makes it explicit that the destination is a directory.
    aws s3 cp --no-sign-request \
        "s3://ont-open-data/giab_2025.01/flowcells/HG001/PAW79146/pod5/${filename}" \
        "${POD5DIR}/"
done


echo "Base-calling pod5 data..."

# Run dorado over every file in POD5DIR and map against the reference genome.
# MODEL and DORADO_CUDA are expected to come from the sourced .env file.
basecall_bam="${GIABDIR}/giab.bam"
dorado_opts=(
    --reference "$REFFILE"
    --emit-moves
    -x "$DORADO_CUDA"
)
# dorado writes the BAM to stdout; capture it as giab.bam.
dorado basecaller "$MODEL" "$POD5DIR" "${dorado_opts[@]}" > "$basecall_bam"


echo "Subsetting the data into short, medium and long reads..."

# Arguments: base-called BAM, pod5 directory, output directory and a
# comma-separated list (presumably the number of reads per subset; the values
# match the read counts iterated over when the f5c data is prepared).
python "${SCRIPTDIR}/subset_full_data.py" \
    "${GIABDIR}/giab.bam" \
    "$POD5DIR" \
    "$SUBSETDIR" \
    "100,1000,10000,100000"


echo "Setting up special data for f5c..."

# Load samtools through the module system when `ml` is available (e.g. on an
# HPC cluster). Elsewhere `ml` does not exist and calling it unconditionally
# would abort the script, so fall back to a samtools already on the PATH.
if command -v ml > /dev/null 2>&1; then
    ml samtools
fi
if ! command -v samtools > /dev/null 2>&1; then
    echo "Error: samtools was not found in the PATH." >&2
    exit 1
fi

# Worker threads for samtools and f5c; override with the THREADS environment
# variable. Defaults to the previously hard-coded value of 32.
THREADS="${THREADS:-32}"

# One subset directory exists per read-length class and read count.
for read_length in short medium long; do
    for n_reads in 100 1000 10000 100000; do
        subset_name="${read_length}_${n_reads}"
        subset_path="${SUBSETDIR}/${subset_name}"
        echo "Processing subset ${subset_name}"

        # Convert the pod5 signal data to BLOW5.
        blue-crab p2s "${subset_path}/subset.pod5" \
            -o "${subset_path}/subset.blow5"

        # Extract the base-called reads as FASTQ.
        samtools fastq -@ "$THREADS" \
            "${subset_path}/subset.bam" \
            > "${subset_path}/subset.fastq"

        # Needed to get Uncalled4 to work with the data...
        samtools sort -@ "$THREADS" \
            "${subset_path}/subset.bam" \
            > "${subset_path}/subset.sorted.bam"

        # Replace the unsorted BAM with the sorted one, then index it.
        mv "${subset_path}/subset.sorted.bam" \
            "${subset_path}/subset.bam"

        samtools index -@ "$THREADS" \
            "${subset_path}/subset.bam"

        # Creates subset.blow5.idx, subset.fastq.index, subset.fastq.index.fai and subset.fastq.index.gzi
        # F5C is quoted so that a path containing whitespace still resolves.
        "$F5C" index -t "$THREADS" --slow5 "${subset_path}/subset.blow5" \
            "${subset_path}/subset.fastq"
    done
done