# ==============================================================================
# B. bassiana & Fungal Genomics: Unified Analysis Pipeline
# ==============================================================================
# Sanitized version: removed duplicates, fixed errors, improved structure,
# consistent variable names, and added comments.
# ==============================================================================

# 1. LOAD LIBRARIES (once, in order)
# ------------------------------------------------------------------------------
library(tidyverse)
library(patchwork)
library(ggrepel)
library(scales)
library(ape)
library(phytools)
library(ragg)
library(ggpubr)
library(igraph)
library(progress)
library(poweRlaw)
library(tidygraph)
library(ggraph)
library(cowplot)
library(progressr)
library(future)
library(furrr)

# 2. GLOBAL CONFIGURATION & PATHS
# ------------------------------------------------------------------------------
set.seed(123)

bigscape_dir     <- "bigscape_output/"
input_bgc_file   <- file.path(bigscape_dir, "Network_Annotations_Full.tsv")
ortho_path       <- "orthofinder_output/"
antismash_dir    <- "antismash_result/GCA_052426205_Beauveria_bassiana/"
string_db_path   <- "string_db/"
output_dir       <- "final_results3/"
string_links_path <- file.path(string_db_path, "STRG0A07FVB.protein.links.v12.0.txt")
phi_results_path  <- "PHI_Base_output/AS272_vs_phi_best.tsv"

if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

file_config <- list(
  ortho_stats  = file.path(ortho_path, "Comparative_Genomics_Statistics/Statistics_PerSpecies.tsv"),
  ortho_counts = file.path(ortho_path, "Orthogroups/Orthogroups.GeneCount.tsv"),
  as272_ko     = "ghost_koala_output/AS272_GhostKOALA.txt",
  ref_ko       = "ghost_koala_output/REF_GhostKOALA.txt",
  as272_phi    = "PHI_Base_output/AS272_vs_phi_best.tsv",
  ref_phi      = "PHI_Base_output/ARSEF2860_vs_phi_best.tsv",
  tree_file    = "SUPERMATRIX.partitions.nex.treefile",
  bgc_annots   = file.path(bigscape_dir, "Network_Annotations_Full.tsv")   # fixed variable
)

# Kelly colors for consistent plotting (used throughout)
kelly_colors <- c(
  "#FFB300", "#803E75", "#FF6800", "#A6BDD7", "#C10020", "#D9C121", "#007D34",
  "#F6768E", "#00538A", "#FF7A5C", "#53377A", "#FF8E00", "#B32851", "#F4C800",
  "#7F180D", "#93AA00", "#593315", "#F13A13", "#232C16", "#00A1C1"
)

# Color assignments for specific strains
strain_colors <- c("ARSEF 2860" = "#e57373", "AS272" = "#4db6ac")
primary_color   <- "#A6BDD7"   # soft blue for boxplots / fill
accent_color    <- "grey70"    # neutral for background points
highlight_color <- "#C10020"   # vibrant red for target highlight

two_col_width  <- 174  # mm
one_col_width  <- 85   # mm
dpi_setting    <- 300

# 3. HELPER FUNCTIONS (shared across parts)
# ------------------------------------------------------------------------------
clean_gene_id <- function(x) {
  str_extract(x, "g[0-9]+\\.t1")
}

process_ko_full <- function(path, strain_name) {
  if (!file.exists(path)) {
    warning("File not found: ", path)
    return(tibble(GeneID = character(), KO = character(),
                  Description = character(), Strain = character()))
  }
  read_tsv(path, col_names = FALSE, show_col_types = FALSE) %>%
    select(GeneID = 1, KO = 2, Description = 3) %>%
    filter(str_detect(KO, "^K\\d+")) %>%
    mutate(Strain = strain_name)
}

parse_phi <- function(path, strain_name) {
  if (!file.exists(path)) {
    warning("File not found: ", path)
    return(tibble(Query = character(), Subject = character(),
                  Phenotype = character(), Strain = character()))
  }
  read_tsv(path, col_names = FALSE, show_col_types = FALSE) %>%
    select(Query = 1, Subject = 2) %>%
    mutate(
      Phenotype = map_chr(str_split(Subject, "#"), tail, 1),
      Phenotype = str_replace_all(Phenotype, "__", " & "),
      Strain = strain_name
    )
}

# ==============================================================================
# PART 1: Comparative Genomics (Figure 1)
# ==============================================================================

# Data loading
p1_data <- read_tsv(file_config$ortho_stats, show_col_types = FALSE) %>%
  rename(Metric = 1) %>%
  pivot_longer(-Metric, names_to = "Strain", values_to = "Value") %>%
  mutate(Value = as.numeric(gsub(",", "", Value))) %>%
  filter(Metric %in% c("Number of genes", "Number of genes in orthogroups",
                       "Number of unassigned genes"))

og_counts <- read_tsv(file_config$ortho_counts, show_col_types = FALSE) %>%
  select(Orthogroup, AS272 = contains("proteins"), REF = contains("protein_REF")) %>%
  rename_with(~ str_replace_all(., c("proteins" = "AS272", "protein_REF" = "REF")))

as272_ko_raw <- process_ko_full(file_config$as272_ko, "AS272")
ref_ko_raw   <- process_ko_full(file_config$ref_ko, "REF")

ko_lookup <- bind_rows(as272_ko_raw, ref_ko_raw) %>%
  select(KO, Description) %>%
  distinct(KO, .keep_all = TRUE)

ko_counts_desc <- full_join(
  as272_ko_raw %>% count(KO, name = "n_as272"),
  ref_ko_raw %>% count(KO, name = "n_ref"),
  by = "KO"
) %>%
  replace_na(list(n_as272 = 0, n_ref = 0)) %>%
  mutate(diff = n_as272 - n_ref) %>%
  left_join(ko_lookup, by = "KO") %>%
  mutate(Label = paste0(KO, ": ", str_trunc(Description, 50)))

phi_combined <- bind_rows(parse_phi(file_config$as272_phi, "AS272"),
                          parse_phi(file_config$ref_phi, "ARSEF2860"))

phi_plot_data <- phi_combined %>%
  count(Phenotype, Strain) %>%
  mutate(Strain = recode(Strain, "ARSEF2860" = "ARSEF 2860"))

top_phenos <- phi_combined %>%
  count(Phenotype) %>%
  slice_max(n, n = 6) %>%
  pull(Phenotype)

# Global theme for Figure 1
theme_175mm <- theme_minimal(base_size = 8) +
  theme(
    plot.tag = element_text(face = "bold", size = 12),
    axis.title = element_text(size = 8, face = "bold"),
    axis.text = element_text(size = 7, color = "black"),
    legend.position = "right",
    legend.text = element_text(size = 6),
    legend.title = element_text(size = 7, face = "bold"),
    panel.grid.minor = element_blank(),
    plot.margin = margin(1, 1, 1, 1, "mm")
  )

# Individual plots (exactly as provided)
p1 <- ggplot(p1_data, aes(
  x = Metric,
  y = Value,
  fill = ifelse(Strain == "protein_REF", "ARSEF 2860", "AS272")
)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = strain_colors) +
  labs(y = "Gene Count", tag = "A", fill = "Strain")

p2_clean <- ggplot(og_counts, aes(x = REF, y = AS272)) +
  geom_abline(
    slope = 1,
    intercept = 0,
    linetype = "dashed",
    alpha = 0.5
  ) +
  geom_jitter(
    alpha = 0.3,
    color = "#283593",
    size = 0.8,
    width = 0.2
  ) +
  scale_x_continuous(limits = c(0, 30)) +
  scale_y_continuous(limits = c(0, 10)) +
  stat_cor(
    aes(label = paste(..r.label.., ..p.label.., sep = "~`,`~")),
    method = "pearson",
    label.x = 20,
    label.y = 8,
    size = 2.5
  ) +
  coord_cartesian(expand = FALSE) +
  labs(
    x = "ARSEF 2860 Count",
    y = "AS272 Count",
    tag = "B"
  )

p2_free <- patchwork::free(p2_clean)

p3_clean <- ggplot(ko_counts_desc %>%
                     filter(diff != 0) %>%
                     slice_max(abs(diff), n = 10),
                   aes(x = reorder(Label, diff), y = diff)) +
  geom_segment(aes(xend = Label, yend = 0),
               color = "grey90",
               linewidth = 0.3) +
  geom_point(aes(color = diff > 0), size = 2) +
  scale_color_manual(values = c("TRUE" = "#00897b", "FALSE" = "#d81b60"),
                     guide = "none") +
  scale_x_discrete(
    labels = function(x)
      str_wrap(x, width = 35)
  ) +
  coord_flip() +
  labs(y = "Delta (AS272 - ARSEF 2860)", tag = "C", x = NULL) +
  theme(
    axis.text.y = element_text(size = 5.5, lineheight = 0.7),
    panel.grid.major.y = element_blank()
  )

p4 <- ggplot(
  phi_plot_data %>% filter(Phenotype %in% top_phenos),
  aes(
    x = reorder(Phenotype, n),
    y = n,
    fill = Strain
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = strain_colors) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 25)) +
  labs(y = "Hit Count",
       tag = "D",
       x = NULL,
       fill = "Strain")

p4 <- p4 + theme(axis.text.y = element_text(margin = margin(r = 3)))

# Final assembly with design
design <- "
AAAAABBBBBB
CCCCCCDDDDD
"

fig1_final <- p1 + p2_clean + p3_clean + p4 +
  plot_layout(
    design = design,
    widths = c(1, 1.5, 1.5, 1.5),
    heights = c(1, 3),
    guides = "collect"
  ) &
  theme_175mm & theme(
    legend.position = "bottom",
    legend.justification = "center",
    legend.box = "horizontal"
  )

ggsave(filename = file.path(output_dir, "Figure_1_Final.tiff"),
       plot = fig1_final, width = 175, height = 140, units = "mm",
       dpi = 600, compression = "lzw")

# ==============================================================================
# PART 2: Phylogenomics (Figure 2)
# ==============================================================================

tree <- read.tree(file_config$tree_file)
outgroup_taxa <- c("Metarhizium_robertsii_ARSEF23", "Metarhizium_anisopliae_JEF-290")
outgroup_taxa <- intersect(outgroup_taxa, tree$tip.label)
rooted_tree <- ladderize(root(tree, outgroup = outgroup_taxa, resolve.root = TRUE))

highlight_isolates <- c("AS272" = "AS272", "MBC618" = "A", "MTCC8017" = "B", "SYSU-MS7908" = "C")
target_tips <- sapply(names(highlight_isolates), function(x) grep(x, rooted_tree$tip.label))

# Format tip labels
raw_tips <- rooted_tree$tip.label
formatted_labels <- sapply(raw_tips, function(x) {
  parts <- strsplit(x, "_")[[1]]
  genus <- parts[1]
  if (grepl("_sp", x)) return(paste(genus, "sp."))
  species <- parts[2]
  paste0(substr(genus, 1, 1), ". ", species)
})
rooted_tree$tip.label <- formatted_labels
unique_names <- unique(rooted_tree$tip.label)

# Color mapping
colors_palette <- setNames(rep(kelly_colors, length.out = length(unique_names)), unique_names)
tip_colors_vec <- colors_palette[rooted_tree$tip.label]

# Legend expressions
legend_exprs <- lapply(unique_names, function(x) {
  if (grepl("sp\\.$", x)) bquote(italic(.(gsub(" sp\\.$", "", x))) ~ "sp.")
  else bquote(italic(.(x)))
})

# Bootstrap values and edge highlighting
bootstrap_vals <- suppressWarnings(as.numeric(sub("/.*", "", rooted_tree$node.label)))
bootstrap_vals[is.na(bootstrap_vals)] <- 0
highlight_nodes <- which(bootstrap_vals >= 80)
edge_colors <- rep("grey40", nrow(rooted_tree$edge))
n_tips <- length(rooted_tree$tip.label)
for (i in seq_along(highlight_nodes)) {
  node_id <- highlight_nodes[i] + n_tips
  edge_index <- which(rooted_tree$edge[, 2] == node_id)
  edge_colors[edge_index] <- "red"
}

# Stretch branches
rooted_tree$edge.length <- log10(sqrt(pmax(rooted_tree$edge.length, 0)) + 1)

# Export
agg_tiff(file.path(output_dir, "Figure_2_Final.tiff"),
         width = 175, height = 220, units = "mm", res = 600, compression = "lzw")
par(mar = c(1, 1, 1, 1))
plot(rooted_tree, type = "fan", tip.color = tip_colors_vec, edge.color = edge_colors,
     edge.width = ifelse(edge_colors == "red", 1.2, 1), cex = 0.45,
     label.offset = 0.1, x.lim = c(-1.35, 1.15))

pp <- get("last_plot.phylo", envir = .PlotPhyloEnv)

# Highlight selected isolates with arrows and labels
for (i in seq_along(target_tips)) {
  tip_id <- target_tips[i]
  if (length(tip_id) == 1) {
    x <- pp$xx[tip_id]
    y <- pp$yy[tip_id]
    species_name <- rooted_tree$tip.label[tip_id]
    arrow_color <- colors_palette[species_name]
    label_text <- highlight_isolates[i]
    
    # default radial arrow
    x0 <- x * 1.35
    y0 <- y * 1.35
    
    # rotate A and C arrows 90° anticlockwise
    if (label_text %in% c("A", "C")) {
      x0 <- x - y * 0.35
      y0 <- y + x * 0.35
    }
    # rotate B arrow 30° clockwise
    if (label_text == "B") {
      theta <- -30 * pi / 180
      dx <- x0 - x
      dy <- y0 - y
      x0 <- x + dx * cos(theta) - dy * sin(theta)
      y0 <- y + dx * sin(theta) + dy * cos(theta)
    }
    
    arrows(x0 = x0, y0 = y0, x1 = x, y1 = y, length = 0.08, lwd = 2, col = arrow_color)
    
    # label placement
    if (label_text == "AS272") {
      text(x = x0 - 0.08, y = y0, labels = label_text, col = arrow_color,
           cex = 0.9, font = 2, adj = c(1, 0.5))
    } else if (label_text %in% c("A", "C")) {
      text(x = x0 + 0.01, y = y0 + 0.06, labels = label_text, col = arrow_color,
           cex = 0.9, font = 2, adj = c(0.5, 0))
    } else {
      text(x = x0, y = y0 + 0.08, labels = label_text, col = arrow_color,
           cex = 0.9, font = 2)
    }
  }
}

nodelabels(node = highlight_nodes, pch = NA, lwd = 2, col = "red")
legend(x = -0.5, y = 0.5, legend = as.expression(legend_exprs),
       fill = colors_palette, border = "white", cex = 0.65, bty = "n",
       ncol = 2, title = "Species Identification", title.font = 2, xpd = NA)

dev.off()

# ==============================================================================
# PART 3: BiG-SCAPE BGCs Analysis - Distribution & Highlights (Figure 3)
# ==============================================================================

# Data loading and processing
bgc_data <- read_tsv(input_bgc_file, show_col_types = FALSE) %>%
  mutate(
    `WGS Accession` = str_extract(BGC, "GCA_[0-9]+\\.[0-9]+|AS272"),
    Species = word(Organism, 1, 2),
    `BiG-SCAPE class` = case_when(
      `BiG-SCAPE class` %in% c("PKSI", "PKSother") ~ "PKS",
      TRUE ~ `BiG-SCAPE class`
    )
  ) %>%
  filter(str_detect(Description, "Beauveria"), !str_starts(BGC, "BGC")) %>%
  distinct()

bgc_data_processed <- bgc_data %>%
  mutate(Species = case_when(
    str_detect(Species, " sp\\.$") ~ Species,
    TRUE ~ gsub("^([A-Z])[a-z]+ ", "\\1. ", Species)
  ))

total_bgc_per_genome <- bgc_data_processed %>%
  group_by(`WGS Accession`, Species) %>%
  summarise(total_count = n(), .groups = "drop")

bgc_counts_per_genome_class <- bgc_data_processed %>%
  group_by(`WGS Accession`, `BiG-SCAPE class`, Species) %>%
  summarise(class_count = n(), .groups = "drop")

# Plotting function
create_bgc_plot_174 <- function(data, x_var, title_text, target_id) {
  plot_data <- data %>%
    mutate(is_target = if_else(str_detect(`WGS Accession`, target_id), "Target", "Normal")) %>%
    arrange(is_target)
  if (nrow(plot_data) == 0) return(NULL)
  ggplot(plot_data, aes(x = !!sym(x_var), y = Species)) +
    geom_boxplot(fill = primary_color, outlier.shape = NA, alpha = 0.3,
                 color = "grey40", size = 0.3) +
    geom_jitter(aes(color = is_target, size = is_target, alpha = is_target),
                height = 0.2, width = 0) +
    scale_color_manual(values = c("Normal" = accent_color, "Target" = highlight_color)) +
    scale_size_manual(values = c("Normal" = 0.8, "Target" = 2.5)) +
    scale_alpha_manual(values = c("Normal" = 0.3, "Target" = 1.0)) +
    labs(subtitle = title_text, x = NULL, y = NULL) +
    theme_minimal(base_size = 8) +
    theme(
      axis.text.y = element_text(face = "italic", size = 7),
      axis.text.x = element_text(size = 7),
      panel.grid.major.y = element_blank(),
      panel.grid.major.x = element_line(color = "grey93", linewidth = 0.2),
      plot.subtitle = element_text(hjust = 0.5, face = "bold", size = 9),
      legend.position = "none",
      plot.margin = margin(2, 5, 2, 2)
    )
}

target_genome <- "GCA_052426205.1"

p_total <- create_bgc_plot_174(total_bgc_per_genome, "total_count", "Total BGCs", target_genome) +
  xlim(30, 65)

classes <- c("Terpene", "NRPS", "PKS-NRP_Hybrids", "PKS", "Others", "RiPPs")
class_plots <- list()
for (cls in classes) {
  df_subset <- bgc_counts_per_genome_class %>% filter(`BiG-SCAPE class` == cls)
  if (nrow(df_subset) > 0) {
    class_plots[[cls]] <- create_bgc_plot_174(df_subset, "class_count", cls, target_genome)
  }
}

bottom_grid <- wrap_plots(class_plots, ncol = 2)
final_layout <- (p_total / bottom_grid) +
  plot_layout(heights = c(1, 3)) +
  plot_annotation(tag_levels = "A") &
  theme(plot.tag = element_text(size = 12, face = "bold"))

ggsave(file.path(output_dir, "Figure_3_Final.tiff"),
       plot = final_layout, width = 174, height = 230, units = "mm",
       dpi = 600, bg = "white", compression = "lzw")

# ==============================================================================
# PART 4: BiG-SCAPE Network Analysis Across Cutoffs (Figure 4)
# ==============================================================================
message("\n=== PART 4: Generating Figure 4 (Parallel Mode) ===")

plan(multisession, workers = parallel::detectCores() - 1)
handlers(global = TRUE)

classes_from_dirs <- c("NRPS", "Others", "PKS-NRP_Hybrids", "PKSI", "PKSother", "RiPPs", "Terpene")

extract_cutoff <- function(filename) {
  cutoff <- str_extract(filename, "c?\\d+\\.\\d+")
  if (!is.na(cutoff) && !str_starts(cutoff, "c")) cutoff <- paste0("c", cutoff)
  cutoff
}

read_network_file <- function(file_path) {
  tryCatch({
    first_line <- readLines(file_path, n = 1)
    sep <- if (str_detect(first_line, "\t")) "\t" else if (str_detect(first_line, ",")) "," else " "
    read.table(file_path, header = TRUE, sep = sep, stringsAsFactors = FALSE,
               quote = "", comment.char = "", fill = TRUE)
  }, error = function(e) {
    message("Error reading: ", basename(file_path), " - ", e$message)
    NULL
  })
}

process_network_file <- function(file_path, class_name) {
  filename <- basename(file_path)
  cutoff_val <- extract_cutoff(filename)
  if (is.na(cutoff_val)) return(tibble())
  edge_data <- read_network_file(file_path)
  if (is.null(edge_data) || nrow(edge_data) < 1) return(tibble())
  cols <- grep("Clustername[._]?[12]", names(edge_data), ignore.case = TRUE, value = TRUE)
  if (length(cols) < 2) cols <- names(edge_data)[1:2]
  g <- graph_from_data_frame(edge_data[, cols[1:2]], directed = FALSE)
  comps <- components(g)
  tibble(
    Class = class_name,
    Cutoff = cutoff_val,
    Nodes = gorder(g),
    Edges = gsize(g),
    GCFs = comps$no,
    MeanGCFSize = mean(comps$csize),
    MaxGCFSize = max(comps$csize),
    Singletons = sum(comps$csize == 1),
    NetworkDensity = edge_density(g)
  )
}

network_jobs <- map_dfr(classes_from_dirs, function(cls) {
  class_path <- file.path(bigscape_dir, cls)
  if (!dir.exists(class_path)) return(tibble())
  network_files <- list.files(class_path, pattern = "\\.network$", full.names = TRUE, ignore.case = TRUE)
  if (length(network_files) == 0) return(tibble())
  tibble(Class = cls, File = network_files)
})

all_stats_df <- future_pmap_dfr(list(network_jobs$File, network_jobs$Class),
                                function(file, cls) process_network_file(file, cls),
                                .progress = TRUE)

all_stats_df <- all_stats_df %>%
  filter(!is.na(Cutoff)) %>%
  mutate(CutoffNumeric = as.numeric(str_remove(Cutoff, "c")),
         Cutoff = factor(Cutoff, levels = unique(Cutoff[order(CutoffNumeric)]))) %>%
  arrange(Class, CutoffNumeric)

write_csv(all_stats_df, file.path(output_dir, "bigscape_network_stats.csv"))

# Plotting
all_stats_df$Class <- factor(all_stats_df$Class)
all_classes <- levels(all_stats_df$Class)
my_colors <- setNames(kelly_colors[1:length(all_classes)], all_classes)
scale_color_shared <- scale_color_manual(values = my_colors, limits = all_classes, drop = FALSE)

base_theme <- theme_minimal(base_size = 8) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey95", linewidth = 0.2),
    axis.line = element_line(color = "black", linewidth = 0.3),
    strip.background = element_rect(fill = "grey90", color = NA),
    strip.text = element_text(face = "bold")
  )
theme_84mm <- base_theme +
  theme(
    plot.tag = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 9, face = "bold"),
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title = element_blank(),
    plot.margin = margin(2, 5, 2, 2)
  )

F4A <- ggplot(all_stats_df, aes(x = Cutoff, y = GCFs, color = Class, group = Class)) +
  geom_line(linewidth = 0.6) + geom_point(size = 1.2) + scale_color_shared +
  theme_84mm + theme(legend.position = "none") + labs(y = "No. GCFs", x = "Cutoff")

F4B <- ggplot(all_stats_df, aes(x = Cutoff, y = Singletons, color = Class, group = Class)) +
  geom_line(linewidth = 0.6) + geom_point(size = 1.2) + scale_color_shared +
  theme_84mm + theme(legend.position = "none") + labs(y = "No. Singletons", x = "Cutoff")

F4C <- ggplot(all_stats_df, aes(x = Cutoff, y = MeanGCFSize, color = Class, group = Class)) +
  geom_line(linewidth = 0.6) + geom_point(size = 1.2) + scale_y_log10() +
  scale_color_shared + theme_84mm + labs(y = "Mean GCF (log10)", x = "Cutoff") +
  guides(color = guide_legend(nrow = 3, byrow = TRUE))

F4D <- all_stats_df %>%
  filter(Cutoff == "c0.60") %>%
  ggplot(aes(x = Class, y = MeanGCFSize)) +
  geom_col(fill = primary_color, width = 0.7) +
  labs(y = "Mean GCF Size", x = "") +
  theme_84mm + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

F4E <- all_stats_df %>%
  filter(Cutoff == "c0.60") %>%
  mutate(SingletonPct = Singletons / Nodes * 100) %>%
  ggplot(aes(x = Class, y = SingletonPct)) +
  geom_col(fill = accent_color, width = 0.7) +
  labs(y = "% Singletons", x = "") +
  theme_84mm + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

grid_F4 <- (F4A / F4B / F4C / F4D / F4E) +
  plot_layout(heights = c(1,1,1,1,1), guides = "collect") +
  plot_annotation(tag_levels = "A", theme = theme(legend.position = "top",
                                                  plot.tag = element_text(size = 12, face = "bold")))

ggsave(file.path(output_dir, "Figure_4_Final.tiff"),
       plot = grid_F4, width = 84, height = 275, units = "mm",
       dpi = 600, compression = "lzw")

# ==============================================================================
# PART 5: GCF Size Analysis (Figure 5)
# ==============================================================================

classes_F5 <- c("NRPS", "Others", "PKS-NRP_Hybrids", "PKSI", "PKSother", "RiPPs", "Terpene")
cutoff_F5 <- "c0.60"

process_clustering_data <- function(class_name, base_dir, cutoff_val) {
  file_path <- file.path(base_dir, class_name, paste0(class_name, "_clustering_", cutoff_val, ".tsv"))
  if (!file.exists(file_path)) {
    message("Warning: File not found: ", file_path)
    return(NULL)
  }
  read_tsv(file_path, show_col_types = FALSE) %>%
    select(BGC = 1, GCF = 2) %>%
    mutate(Class = class_name, Is_MIBiG = str_starts(BGC, "BGC"))
}

all_data_F5 <- map_dfr(classes_F5, process_clustering_data,
                       base_dir = bigscape_dir, cutoff_val = cutoff_F5) %>%
  mutate(Class = case_when(Class %in% c("PKSI", "PKSother") ~ "PKS", TRUE ~ Class))

all_gcf_stats <- all_data_F5 %>%
  group_by(Class, GCF) %>%
  summarise(GCF_Size = n(), Contains_MIBiG = any(Is_MIBiG), .groups = "drop")

class_order_F5 <- all_gcf_stats %>%
  group_by(Class) %>%
  summarise(Med = median(GCF_Size)) %>%
  arrange(desc(Med)) %>%
  pull(Class)

all_gcf_stats$Class <- factor(all_gcf_stats$Class, levels = class_order_F5)
fig5_palette <- setNames(kelly_colors[1:length(class_order_F5)], class_order_F5)

create_gcf_pseudo_log_174 <- function(data, title_text) {
  ggplot(data, aes(x = Class, y = GCF_Size, fill = Class)) +
    geom_violin(alpha = 0.2, color = NA, scale = "width") +
    geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.5, color = "black", linewidth = 0.3) +
    geom_jitter(width = 0.15, alpha = 0.3, size = 0.8, shape = 16) +
    scale_y_continuous(trans = "pseudo_log", breaks = c(1, 5, 10, 20, 50, 100, 200), limits = c(0, 250)) +
    scale_fill_manual(values = fig5_palette) +
    labs(subtitle = title_text, y = "GCF Size (No. of BGCs)", x = "") +
    theme_minimal(base_size = 8) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, color = "black", size = 7),
      axis.text.y = element_text(size = 7, color = "black"),
      panel.grid.major.y = element_line(color = "grey92", linewidth = 0.2),
      panel.grid.minor.y = element_blank(),
      plot.subtitle = element_text(hjust = 0.5, face = "bold", size = 10),
      legend.position = "none",
      plot.margin = margin(5, 5, 5, 5)
    )
}

F5A <- create_gcf_pseudo_log_174(all_gcf_stats, "GCF Size Distribution")
F5B <- create_gcf_pseudo_log_174(all_gcf_stats %>% filter(Contains_MIBiG == TRUE),
                                 "MIBiG-containing GCFs")

final_fig5 <- (F5A + F5B) +
  plot_annotation(tag_levels = "A") &
  theme(plot.tag = element_text(size = 12, face = "bold"))

ggsave(file.path(output_dir, "Figure_5_Final.tiff"),
       plot = final_fig5, width = 174, height = 100, units = "mm",
       dpi = 600, bg = "white", compression = "lzw")

# ==============================================================================
# PART 6: Power Law Analysis & Taxonomic Distribution (Figure 6)
# ==============================================================================

bgc_metadata_full <- bgc_data %>%
  mutate(Clean_ID = str_remove(BGC, "\\.\\d+$")) %>%
  select(Clean_ID, BGC, `WGS Accession`, Species, Description) %>%
  distinct()

species_totals <- bgc_metadata_full %>%
  distinct(`WGS Accession`, Species) %>%
  count(Species, name = "Total_Isolates_In_Dataset")
total_species_count <- n_distinct(bgc_metadata_full$Species)

# Table S2: large GCFs
large_gcf_summary <- all_data_F5 %>%
  left_join(bgc_metadata_full %>% select(BGC, Species), by = "BGC") %>%
  group_by(Class, GCF) %>%
  mutate(GCF_Size = n()) %>%
  filter(GCF_Size > 100) %>%
  summarise(
    Total_BGCs = n(),
    Unique_Species_Count = n_distinct(Species, na.rm = TRUE),
    Strains_Count = length(Species),
    Has_MIBiG = any(Is_MIBiG),
    MIBiG_ID = paste(unique(BGC[Is_MIBiG]), collapse = "; "),
    .groups = "drop"
  )
write_csv(large_gcf_summary, file.path(output_dir, "Table_S2.csv"))

# Power law for B. bassiana
bb_isolates_per_gcf <- all_data_F5 %>%
  left_join(bgc_metadata_full, by = "BGC") %>%
  filter(Species == "Beauveria bassiana") %>%
  group_by(Class, GCF) %>%
  summarise(Isolates_Count = n_distinct(`WGS Accession`, na.rm = TRUE), .groups = "drop")

data_vec <- na.omit(bb_isolates_per_gcf$Isolates_Count)
data_vec <- data_vec[data_vec > 0]
if (length(data_vec) < 10) stop("Insufficient data points for reliable power-law fitting.")

m_pl <- displ$new(data_vec)
est_pl <- estimate_xmin(m_pl)
m_pl$setXmin(est_pl)
m_pl$setPars(estimate_pars(m_pl))
xmin_value <- m_pl$getXmin()
alpha_value <- m_pl$pars

m_ln <- dislnorm$new(data_vec)
m_ln$setXmin(xmin_value)
m_ln$setPars(estimate_pars(m_ln))

comparison <- compare_distributions(m_pl, m_ln)
LR_stat <- comparison$test_statistic
p_value <- comparison$p_two_sided

cat("Power Law alpha:", round(alpha_value, 3), "\n")
cat("Estimated xmin:", xmin_value, "\n")
cat("Likelihood Ratio (PL vs LN):", round(LR_stat, 3), "\n")
cat("p-value:", signif(p_value, 3), "\n")

# Plot
agg_tiff(file.path(output_dir, "Figure_6_Final.tiff"),
         width = 84, height = 84, units = "mm", res = 600)
par(mar = c(3.5, 3.5, 2, 1), mgp = c(2.2, 0.7, 0))
plot(m_pl, pch = 21, bg = alpha("grey80", 0.6), col = "black", cex = 0.5,
     xlab = "Isolates per GCF", ylab = expression(P(X >= x)),
     main = bquote("Scaling of" ~ italic("B. bassiana") ~ "GCFs"),
     cex.main = 0.9, cex.lab = 0.8, cex.axis = 0.7, bty = "l")
lines(m_pl, col = highlight_color, lwd = 1.5)
lines(m_ln, col = "#00538A", lwd = 1.2, lty = 2)

p_label <- ifelse(p_value < 0.001, "p < 0.001", paste0("p = ", signif(p_value, 3)))
legend("bottomleft",
       legend = c(as.expression(bquote("Power Law (" * alpha == .(round(alpha_value, 2)) *
                                         ", xmin = " * .(xmin_value) * ")")),
                  "Log-Normal Fit", paste0("LR = ", round(LR_stat, 2), ", ", p_label)),
       col = c(highlight_color, "#00538A", NA), lty = c(1, 2, NA), lwd = 1.2,
       cex = 0.6, bty = "n")
dev.off()

# ==============================================================================
# PART 7: Integrated Pipeline – BGC Seeds, STRING PPI, and PHI-Base Virulence
# ==============================================================================

# Load PHI-Base hits
phi_hits <- read_tsv(phi_results_path, col_names = FALSE, show_col_types = FALSE) %>%
  select(ProteinID = 1) %>%
  mutate(ProteinID = str_extract(ProteinID, "g[0-9]+\\.t[0-9]+")) %>%
  filter(!is.na(ProteinID)) %>%
  distinct() %>%
  pull(ProteinID)

# Helper functions for this part
map_gcf_to_as272 <- function(class_name, gcf_id, base_dir, target_pattern) {
  file_path <- file.path(base_dir, class_name, paste0(class_name, "_clustering_c0.60.tsv"))
  if (!file.exists(file_path)) return(NA_character_)
  data <- read_tsv(file_path, show_col_types = FALSE)
  match <- data %>%
    filter(.[[2]] == gcf_id) %>%
    filter(str_detect(.[[1]], target_pattern)) %>%
    pull(1)
  if (length(match) > 0) match[1] else NA_character_
}

extract_seeds <- function(bgc_id, root_path) {
  if (is.na(bgc_id)) return(NA_character_)
  acc <- str_match(bgc_id, "(JBQTZT[0-9]+)\\.(region[0-9]+)")
  if (is.na(acc[1, 1])) return(NA_character_)
  target_file <- paste0(acc[1, 2], ".1.", acc[1, 3], ".gbk")
  gbk_path <- file.path(root_path, target_file)
  if (!file.exists(gbk_path)) return(NA_character_)
  lines <- readLines(gbk_path, warn = FALSE)
  seeds <- lines[str_detect(lines, "/Parent=|/ID=")] %>%
    str_extract("(?<==\").*?(?=\")") %>%
    str_extract("^g[0-9]+\\.t[0-9]+") %>%
    na.omit() %>%
    unique() %>%
    sort()
  paste(seeds, collapse = ", ")
}

find_interactors_for_bgc <- function(seed_string, links_db, max_total_interactors = 50) {
  if (is.na(seed_string)) return(NA_character_)
  seeds <- unlist(str_split(seed_string, ", "))
  neighbors <- links_db %>%
    filter(protein1 %in% seeds | protein2 %in% seeds) %>%
    mutate(neighbor = if_else(protein1 %in% seeds, protein2, protein1)) %>%
    filter(!neighbor %in% seeds) %>%
    group_by(neighbor) %>%
    summarise(score = max(combined_score), .groups = "drop") %>%
    arrange(desc(score)) %>%
    slice_head(n = max_total_interactors) %>%
    pull(neighbor)
  if (length(neighbors) > 0) paste(sort(neighbors), collapse = ", ") else NA_character_
}

classify_virulence <- function(protein_string, type, phi_list) {
  if (is.na(protein_string)) return(NA_character_)
  prots <- unlist(str_split(protein_string, ", "))
  classes <- map_chr(prots, function(p) {
    has_phi <- p %in% phi_list
    if (type == "Seed") {
      ifelse(has_phi, "Hub + PHI", "Hub")
    } else {
      ifelse(has_phi, "Connector + PHI", "Connector")
    }
  })
  paste(classes, collapse = ", ")
}

get_n <- function(x, label) {
  if (is.na(x)) return(0L)
  sum(unlist(str_split(x, ", ")) == label)
}

# Load STRING
string_links <- read_table(string_links_path, show_col_types = FALSE) %>%
  mutate(protein1 = str_remove(protein1, "STRG0A07FVB\\."),
         protein2 = str_remove(protein2, "STRG0A07FVB\\.")) %>%
  filter(combined_score >= 700)

# Define GCFs of interest (each row is a pair)
hubs_to_retrieve <- tribble(
  ~Class,               ~GCF,
  "Terpene",           1989,
  "Terpene",           8171,
  "NRPS",              8144,
  "PKSother",          12262,
  "PKSI",              8173,
  "Terpene",           8185,
  "NRPS",              1963,
  "NRPS",              15427,
  "PKSI",              8162,
  "NRPS",              4080,
  "PKSI",              7963,
  "PKSI",              6648,
  "PKS-NRP_Hybrids",   12259,
  "NRPS",              16137,
  "NRPS",              12279
)

# Execute pipeline
AS272_mappings <- hubs_to_retrieve %>%
  mutate(AS272_BGC_ID = map2_chr(Class, GCF,
                                 ~map_gcf_to_as272(.x, .y, bigscape_dir, "GCA_052426205.1"))) %>%
  mutate(Protein_IDs = map_chr(AS272_BGC_ID, ~extract_seeds(.x, antismash_dir))) %>%
  mutate(Interactor_IDs = map_chr(Protein_IDs,
                                  ~find_interactors_for_bgc(.x, string_links, max_total_interactors = 50))) %>%
  mutate(Interactor_Count = str_count(Interactor_IDs, "g\\d+")) %>%
  mutate(Seed_Classification = map_chr(Protein_IDs, ~classify_virulence(.x, "Seed", phi_hits)),
         Connector_Classification = map_chr(Interactor_IDs, ~classify_virulence(.x, "Connector", phi_hits)))

# Count and summarize
AS272_summary_table <- AS272_mappings %>%
  rowwise() %>%
  mutate(
    n_Seed          = get_n(Seed_Classification, "Hub"),
    n_Seed_PHI      = get_n(Seed_Classification, "Hub + PHI"),
    n_Connector     = get_n(Connector_Classification, "Connector"),
    n_Connector_PHI = get_n(Connector_Classification, "Connector + PHI")
  ) %>%
  ungroup() %>%
  select(Class, GCF, n_Seed, n_Seed_PHI, n_Connector, n_Connector_PHI) %>%
  arrange(desc(n_Seed_PHI), desc(n_Connector_PHI))

# Network metrics for these GCFs
build_network <- function(row_df, string_links, phi_hits) {
  seeds <- unlist(str_split(row_df$Protein_IDs, ", "))
  interactors <- unlist(str_split(row_df$Interactor_IDs, ", "))
  seeds <- seeds[!is.na(seeds)]
  interactors <- interactors[!is.na(interactors)]
  nodes <- unique(c(seeds, interactors))
  if (length(nodes) == 0) return(NULL)
  edges <- string_links %>%
    filter(protein1 %in% nodes & protein2 %in% nodes) %>%
    select(protein1, protein2)
  if (nrow(edges) == 0) {
    g <- make_empty_graph() %>% add_vertices(length(nodes), name = nodes)
  } else {
    g <- graph_from_data_frame(edges, vertices = tibble(name = nodes), directed = FALSE)
  }
  V(g)$is_seed <- V(g)$name %in% seeds
  V(g)$is_phi  <- V(g)$name %in% phi_hits
  return(g)
}

evaluate_network <- function(row_df, string_links, phi_hits, background_nodes) {
  if (is.na(row_df$Protein_IDs)) return(NULL)
  g <- build_network(row_df, string_links, phi_hits)
  if (is.null(g)) return(NULL)
  n_nodes <- vcount(g)
  n_edges <- ecount(g)
  density <- ifelse(n_nodes > 1, edge_density(g), NA)
  avg_degree <- ifelse(n_nodes > 0, mean(degree(g)), NA)
  n_components <- ifelse(n_nodes > 0, components(g)$no, NA)
  clustering <- ifelse(n_nodes > 2, suppressWarnings(transitivity(g, type = "global")), NA)
  random_clustering <- NA
  if (!is.na(n_nodes) && !is.na(n_edges) && n_nodes > 2 && n_edges > 0) {
    g_random <- sample_gnm(n_nodes, n_edges)
    random_clustering <- suppressWarnings(transitivity(g_random, type = "global"))
  }
  clustering_enrichment <- ifelse(!is.na(clustering) && !is.na(random_clustering) && random_clustering > 0,
                                  clustering / random_clustering, NA)
  phi_in_network <- sum(V(g)$is_phi)
  non_phi_in_network <- n_nodes - phi_in_network
  total_phi_background <- length(phi_hits)
  total_background <- length(background_nodes)
  fisher_p <- NA
  fisher_odds <- NA
  if (total_background > 0) {
    fisher_matrix <- matrix(c(phi_in_network, non_phi_in_network,
                              total_phi_background - phi_in_network,
                              total_background - total_phi_background - non_phi_in_network), nrow = 2)
    fisher_res <- tryCatch(fisher.test(fisher_matrix), error = function(e) NULL)
    if (!is.null(fisher_res)) {
      fisher_p <- fisher_res$p.value
      fisher_odds <- as.numeric(fisher_res$estimate)
    }
  }
  tibble(
    Class = row_df$Class,
    GCF = row_df$GCF,
    Nodes = n_nodes,
    Edges = n_edges,
    Density = density,
    Avg_Degree = avg_degree,
    Components = n_components,
    Clustering = clustering,
    Random_Clustering = random_clustering,
    Clustering_Enrichment = clustering_enrichment,
    PHI_Count = phi_in_network,
    PHI_Enrichment_Odds = fisher_odds,
    PHI_Enrichment_p = fisher_p
  )
}

background_nodes <- unique(c(string_links$protein1, string_links$protein2))

network_metrics_table <- map_dfr(seq_len(nrow(AS272_mappings)),
                                 ~evaluate_network(AS272_mappings[.x, ],
                                                   string_links, phi_hits, background_nodes)) %>%
  filter(!is.na(Nodes)) %>%
  mutate(PHI_FDR = p.adjust(PHI_Enrichment_p, method = "BH")) %>%
  mutate(across(c(Density, Avg_Degree, Clustering, Random_Clustering, Clustering_Enrichment,
                  PHI_Enrichment_Odds), round, digits = 3),
         PHI_Enrichment_p = signif(PHI_Enrichment_p, 3),
         PHI_FDR = signif(PHI_FDR, 3)) %>%
  select(Class, GCF, Nodes, Edges, Density, Avg_Degree, Components, Clustering,
         Random_Clustering, Clustering_Enrichment, PHI_Count, PHI_Enrichment_Odds,
         PHI_Enrichment_p, PHI_FDR)

write_csv(network_metrics_table, file.path(output_dir, "Table_S3.csv"))

# Network plots for selected GCFs
build_network_plot <- function(row_df, string_links, phi_hits) {
  g <- build_network(row_df, string_links, phi_hits)
  if (is.null(g)) return(NULL)
  seeds <- unlist(str_split(row_df$Protein_IDs, ", "))
  V(g)$type <- case_when(
    V(g)$name %in% seeds & V(g)$name %in% phi_hits ~ "Hub + PHI",
    V(g)$name %in% seeds ~ "Hub",
    V(g)$name %in% phi_hits ~ "Connector + PHI",
    TRUE ~ "Connector"
  )
  V(g)$type <- factor(V(g)$type, levels = c("Connector", "Connector + PHI", "Hub", "Hub + PHI"))
  return(g)
}

plot_panel <- function(row_df, tag_letter) {
  g <- build_network_plot(row_df, string_links, phi_hits)
  if (is.null(g)) return(NULL)
  V(g)$type <- factor(V(g)$type, levels = c("Connector", "Connector + PHI", "Hub", "Hub + PHI"))
  g_tbl <- as_tbl_graph(g)
  ggraph(g_tbl, layout = "fr") +
    geom_edge_link(color = "grey75", alpha = 0.6) +
    geom_node_point(aes(fill = type, size = type), shape = 21, color = "black", stroke = 0.3) +
    scale_fill_manual(values = c("Connector" = "white", "Connector + PHI" = "#1a9641",
                                 "Hub" = "#fdae61", "Hub + PHI" = "#d7191c")) +
    scale_size_manual(values = c("Connector" = 3, "Connector + PHI" = 4,
                                 "Hub" = 5, "Hub + PHI" = 6), guide = "none") +
    theme_void() +
    labs(title = paste0(row_df$Class, " (GCF ", row_df$GCF, ")"), tag = tag_letter) +
    theme(plot.title = element_text(size = 11, face = "bold", hjust = 0.5),
          plot.tag = element_text(size = 14, face = "bold"),
          legend.position = "none")
}

selected_gcfs <- c(1989, 8144, 12262, 6648, 12259, 16137)
rows_to_plot <- AS272_mappings %>% filter(GCF %in% selected_gcfs)
panel_letters <- LETTERS[seq_len(nrow(rows_to_plot))]

plots <- map2(split(rows_to_plot, seq_len(nrow(rows_to_plot))),
              panel_letters,
              ~plot_panel(.x, .y)) %>%
  compact()

# Extract legend from one plot
legend_plot <- plots[[3]] +
  theme(legend.position = "bottom", legend.title = element_text(size = 11, face = "bold"),
        legend.text = element_text(size = 11), legend.key.size = unit(1.2, "cm"),
        legend.spacing.x = unit(0.6, "cm"))
legend <- cowplot::get_legend(legend_plot)

panel_grid <- wrap_plots(plots, ncol = 2)
final_plot7 <- plot_grid(panel_grid, legend, ncol = 1, rel_heights = c(1, 0.08))

ggsave(filename = file.path(output_dir, "Figure_7_Final.tiff"),
       plot = final_plot7, width = 174, height = 225, units = "mm",
       dpi = 600, compression = "lzw")

# End of script