# ============================================================================
# TCGA-BRCA miRNA-seq validation script
# Purpose: Independently validate ER- and HER2-associated miRNAs from the
#          Taipei VGH Clariom D microarray cohort using TCGA-BRCA miRNA-seq.
#
# Tested with R >= 4.3 and TCGAbiolinks 2.x / 3.x.
# Robust to subtype-table column-naming changes across versions.
# Uses BCR Biotab clinical_patient_brca for ER/HER2 IHC status (most reliable
# source in current TCGAbiolinks releases).
# ============================================================================

# ---------------------------------------------------------------------------
# 0. INSTALL DEPENDENCIES (runs once; safe to leave in for every run)
# ---------------------------------------------------------------------------
required_cran <- c(
  "BiocManager", "dplyr", "ggplot2", "ggrepel",
  "vroom", "readr", "tibble", "tidyr", "purrr",
  "jsonlite", "xml2", "httr", "curl", "data.table",
  "R.utils", "stringr", "knitr"
)
for (pkg in required_cran) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    message("Installing CRAN package: ", pkg)
    install.packages(pkg, repos = "https://cloud.r-project.org",
                     dependencies = TRUE)
  }
}
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
}
required_bioc <- c("TCGAbiolinks", "SummarizedExperiment", "limma", "edgeR")
for (pkg in required_bioc) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    message("Installing Bioconductor package: ", pkg)
    BiocManager::install(pkg, ask = FALSE, update = FALSE,
                         dependencies = TRUE)
  }
}

# ---------------------------------------------------------------------------
# 1. LOAD LIBRARIES
# ---------------------------------------------------------------------------
suppressPackageStartupMessages({
  library(TCGAbiolinks)
  library(SummarizedExperiment)
  library(limma)
  library(edgeR)
  library(dplyr)
  library(ggplot2)
  library(ggrepel)
})

# ---------------------------------------------------------------------------
# 2. WORKING DIRECTORY
# ---------------------------------------------------------------------------
work_dir <- file.path(Sys.getenv("HOME"), "TCGA_validation")
dir.create(work_dir, showWarnings = FALSE, recursive = TRUE)
setwd(work_dir)
message("Working directory: ", getwd())

# ---------------------------------------------------------------------------
# 3. DOWNLOAD TCGA-BRCA miRNA-seq QUANTIFICATION
# ---------------------------------------------------------------------------
message("\n--- Step 3: Downloading TCGA-BRCA miRNA-seq quantification ---")
query_mirna <- GDCquery(
  project       = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type     = "miRNA Expression Quantification",
  experimental.strategy = "miRNA-Seq",
  workflow.type = "BCGSC miRNA Profiling"
)
GDCdownload(query_mirna, method = "api", files.per.chunk = 30)
mirna <- GDCprepare(query_mirna)

rpm_cols   <- grep("^reads_per_million_miRNA_mapped_", colnames(mirna), value = TRUE)
expr_mat   <- as.matrix(mirna[, rpm_cols])
rownames(expr_mat) <- mirna$miRNA_ID
colnames(expr_mat) <- sub("^reads_per_million_miRNA_mapped_", "", rpm_cols)
message(sprintf("Expression matrix: %d miRNAs x %d aliquots", nrow(expr_mat), ncol(expr_mat)))

# ---------------------------------------------------------------------------
# 4. ER / HER2 ANNOTATION - tries 3 sources, in order
# ---------------------------------------------------------------------------
# Source A: TCGAquery_subtype("brca")               (varies by version)
# Source B: BCR Biotab clinical_patient_brca        (has er_status_by_ihc etc.)
# Source C: GDCquery_clinic("TCGA-BRCA")            (rarely has IHC anymore)

find_col <- function(df, patterns, label) {
  for (pat in patterns) {
    hit <- grep(pat, colnames(df), ignore.case = TRUE, value = TRUE)
    if (length(hit) > 0) {
      message(sprintf("  -> using column '%s' for %s", hit[1], label))
      return(hit[1])
    }
  }
  return(NA_character_)
}

normalize_status <- function(x) {
  ifelse(grepl("positive", x, ignore.case = TRUE), "Positive",
  ifelse(grepl("negative", x, ignore.case = TRUE), "Negative", NA))
}

annot <- NULL  # data.frame with patient, ER_status, HER2_status

# ----------- Source A: subtype table -----------
message("\n--- Step 4A: Trying TCGAquery_subtype(brca) ---")
subtype <- tryCatch(TCGAquery_subtype(tumor = "brca"), error = function(e) NULL)
if (!is.null(subtype)) {
  message("Subtype columns: ", paste(colnames(subtype), collapse = ", "))
  pat_col  <- find_col(subtype, c("^patient$", "^submitter_id$", "^bcr_patient_barcode$"), "patient ID")
  er_col   <- find_col(subtype, c("^ER[._ ]Status$", "ER.*Status.*IHC", "er_status_by_ihc", "^ER$"), "ER")
  her2_col <- find_col(subtype, c("^HER2[._ ]Final[._ ]Status$", "HER2.*Status.*IHC",
                                  "her2_status_by_ihc", "^HER2[._ ]Status$", "^HER2$"), "HER2")
  if (!is.na(pat_col) && !is.na(er_col) && !is.na(her2_col)) {
    annot <- data.frame(
      patient     = subtype[[pat_col]],
      ER_status   = normalize_status(subtype[[er_col]]),
      HER2_status = normalize_status(subtype[[her2_col]]),
      stringsAsFactors = FALSE
    ) |> filter(!is.na(ER_status), !is.na(HER2_status))
    message(sprintf("Source A success: %d patients with ER and HER2 calls", nrow(annot)))
  } else {
    message("Source A: subtype table lacks IHC columns; trying next source.")
  }
}

# ----------- Source B: BCR Biotab clinical_patient_brca -----------
if (is.null(annot) || nrow(annot) < 20) {
  message("\n--- Step 4B: Trying BCR Biotab clinical_patient_brca ---")
  query_biotab <- tryCatch(
    GDCquery(project = "TCGA-BRCA",
             data.category = "Clinical",
             data.type = "Clinical Supplement",
             data.format = "BCR Biotab"),
    error = function(e) { message("GDCquery biotab failed: ", conditionMessage(e)); NULL }
  )
  if (!is.null(query_biotab)) {
    GDCdownload(query_biotab, method = "api")
    biotab <- GDCprepare(query_biotab)
    # 'biotab' is a list of data.frames keyed by sheet name
    message("Biotab tables available: ", paste(names(biotab), collapse = ", "))
    # Find the patient-level table (clinical_patient_brca)
    pat_tab_name <- grep("clinical_patient", names(biotab), value = TRUE)
    if (length(pat_tab_name) >= 1) {
      pat_tab <- biotab[[pat_tab_name[1]]]
      # Biotab often has 1-2 header rows of column descriptions; drop them
      if (any(grepl("^CDE_ID|bcr_patient_barcode$", pat_tab$bcr_patient_barcode))) {
        pat_tab <- pat_tab[!grepl("^CDE_ID|bcr_patient_barcode$", pat_tab$bcr_patient_barcode), ]
      }
      message("clinical_patient_brca columns containing er/her2:")
      relcols <- grep("er_status|estrogen|her2", colnames(pat_tab), value = TRUE, ignore.case = TRUE)
      print(relcols)
      pat_col  <- "bcr_patient_barcode"
      er_col   <- find_col(pat_tab, c("er_status_by_ihc", "breast_carcinoma_estrogen_receptor_status",
                                       "estrogen_receptor_status"), "ER (biotab)")
      her2_col <- find_col(pat_tab, c("her2_status_by_ihc",
                                       "lab_proc_her2_neu_immunohistochemistry_receptor_status",
                                       "her2_immunohistochemistry_level_result",
                                       "her2_neu_status"), "HER2 (biotab)")
      if (!is.na(er_col) && !is.na(her2_col)) {
        annot <- data.frame(
          patient     = pat_tab[[pat_col]],
          ER_status   = normalize_status(pat_tab[[er_col]]),
          HER2_status = normalize_status(pat_tab[[her2_col]]),
          stringsAsFactors = FALSE
        ) |> filter(!is.na(ER_status), !is.na(HER2_status))
        message(sprintf("Source B success: %d patients with ER and HER2 calls", nrow(annot)))
      } else {
        message("Source B: biotab clinical_patient_brca lacks ER and/or HER2 IHC columns.")
      }
    }
  }
}

# ----------- Source C: GDCquery_clinic -----------
if (is.null(annot) || nrow(annot) < 20) {
  message("\n--- Step 4C: Trying GDCquery_clinic(TCGA-BRCA) ---")
  clin <- tryCatch(GDCquery_clinic("TCGA-BRCA", type = "clinical"), error = function(e) NULL)
  if (!is.null(clin)) {
    pat_col  <- "submitter_id"
    er_col   <- find_col(clin, c("breast_carcinoma_estrogen_receptor_status",
                                  "estrogen_receptor_status", "er_status_by_ihc"), "ER (clinical)")
    her2_col <- find_col(clin, c("lab_proc_her2_neu_immunohistochemistry_receptor_status",
                                  "her2_status_by_ihc", "her2_neu_status"), "HER2 (clinical)")
    if (!is.na(er_col) && !is.na(her2_col)) {
      annot <- data.frame(
        patient     = clin[[pat_col]],
        ER_status   = normalize_status(clin[[er_col]]),
        HER2_status = normalize_status(clin[[her2_col]]),
        stringsAsFactors = FALSE
      ) |> filter(!is.na(ER_status), !is.na(HER2_status))
      message(sprintf("Source C success: %d patients with ER and HER2 calls", nrow(annot)))
    }
  }
}

if (is.null(annot) || nrow(annot) < 20) {
  stop("Could not locate ER and HER2 status from any of the three TCGA sources. ",
       "Please run `View(biotab$clinical_patient_brca)` and identify the IHC ",
       "column names manually, then add them to the er_col / her2_col patterns ",
       "in Source B of this script.")
}

# ---------------------------------------------------------------------------
# 5. JOIN ANNOTATION TO THE EXPRESSION MATRIX AND FILTER
# ---------------------------------------------------------------------------
sample_meta <- data.frame(
  aliquot = colnames(expr_mat),
  patient = substr(colnames(expr_mat), 1, 12),
  stringsAsFactors = FALSE
) |>
  left_join(annot, by = "patient") |>
  filter(ER_status   %in% c("Positive", "Negative"),
         HER2_status %in% c("Positive", "Negative"))

expr_mat <- expr_mat[, sample_meta$aliquot]
message(sprintf("\nSamples retained for analysis: %d (ER+: %d, ER-: %d, HER2+: %d, HER2-: %d)",
                nrow(sample_meta),
                sum(sample_meta$ER_status   == "Positive"),
                sum(sample_meta$ER_status   == "Negative"),
                sum(sample_meta$HER2_status == "Positive"),
                sum(sample_meta$HER2_status == "Negative")))

# ---------------------------------------------------------------------------
# 6. DIFFERENTIAL EXPRESSION (limma-trend on log2(RPM+1))
# ---------------------------------------------------------------------------
y <- log2(expr_mat + 1)

design_ER <- model.matrix(~ ER_status + HER2_status, data = sample_meta)
fit_ER    <- lmFit(y, design_ER)
fit_ER    <- eBayes(fit_ER, trend = TRUE, robust = TRUE)
res_ER    <- topTable(fit_ER, coef = "ER_statusPositive", number = Inf,
                      sort.by = "P", adjust.method = "BH")
res_ER$miRNA <- rownames(res_ER)

design_H2 <- model.matrix(~ HER2_status + ER_status, data = sample_meta)
fit_H2    <- lmFit(y, design_H2)
fit_H2    <- eBayes(fit_H2, trend = TRUE, robust = TRUE)
res_H2    <- topTable(fit_H2, coef = "HER2_statusPositive", number = Inf,
                      sort.by = "P", adjust.method = "BH")
res_H2$miRNA <- rownames(res_H2)

# ---------------------------------------------------------------------------
# 7. COMPARE TO DISCOVERY HITS
# ---------------------------------------------------------------------------
discovery_ER <- c("hsa-mir-7-2","hsa-mir-4468","hsa-mir-6812","hsa-mir-548i-2",
                  "hsa-mir-1278","hsa-mir-548ba","hsa-mir-8086","hsa-mir-378d-2",
                  "hsa-mir-190a","hsa-mir-92b","hsa-mir-1273f")
discovery_H2 <- c("hsa-mir-4728","hsa-mir-7641-2","hsa-mir-4311","hsa-mir-6762",
                  "hsa-mir-3668","hsa-mir-1343","hsa-mir-384","hsa-mir-4535",
                  "hsa-mir-146a","hsa-mir-6825","hsa-mir-5701-1","hsa-mir-5701-2",
                  "hsa-mir-5701-3","hsa-mir-496","hsa-mir-550b-1","hsa-mir-4520-1")

val_ER <- res_ER |> filter(miRNA %in% discovery_ER) |>
          select(miRNA, logFC, P.Value, adj.P.Val)
val_H2 <- res_H2 |> filter(miRNA %in% discovery_H2) |>
          select(miRNA, logFC, P.Value, adj.P.Val)

write.csv(val_ER, "tcga_validation_ER.csv",   row.names = FALSE)
write.csv(val_H2, "tcga_validation_HER2.csv", row.names = FALSE)
write.csv(res_ER, "tcga_full_results_ER.csv",   row.names = FALSE)
write.csv(res_H2, "tcga_full_results_HER2.csv", row.names = FALSE)

# ---------------------------------------------------------------------------
# 8. VOLCANO PLOTS
# ---------------------------------------------------------------------------
make_volcano <- function(res, hits, title, outfile) {
  res$signif <- res$adj.P.Val < 0.05
  res$label  <- ifelse(res$miRNA %in% hits, res$miRNA, NA)
  p <- ggplot(res, aes(x = logFC, y = -log10(P.Value))) +
         geom_point(aes(color = signif), alpha = 0.4, size = 0.6) +
         geom_point(data = subset(res, miRNA %in% hits),
                    color = "red", size = 1.8) +
         ggrepel::geom_text_repel(aes(label = label), size = 3, max.overlaps = 30) +
         scale_color_manual(values = c("grey70", "steelblue"),
                            labels = c("q >= 0.05", "q < 0.05")) +
         theme_classic(base_size = 11) +
         labs(title = title, color = "TCGA-BRCA significance",
              x = "log2 fold change", y = "-log10(P)")
  ggsave(outfile, p, width = 7, height = 5)
}

make_volcano(res_ER, discovery_ER,
             "TCGA-BRCA: ER+ vs ER- (limma-trend)",
             "tcga_brca_validation_volcano_ER.pdf")
make_volcano(res_H2, discovery_H2,
             "TCGA-BRCA: HER2+ vs HER2- (limma-trend)",
             "tcga_brca_validation_volcano_HER2.pdf")

cat("\nValidation complete. Outputs in:", getwd(), "\n")
cat("Files written:\n")
cat("  tcga_validation_ER.csv             - discovery ER candidates with TCGA stats\n")
cat("  tcga_validation_HER2.csv           - discovery HER2 candidates with TCGA stats\n")
cat("  tcga_full_results_ER.csv           - full TCGA ER contrast\n")
cat("  tcga_full_results_HER2.csv         - full TCGA HER2 contrast\n")
cat("  tcga_brca_validation_volcano_ER.pdf\n")
cat("  tcga_brca_validation_volcano_HER2.pdf\n")