# ============================================================================ # TCGA-BRCA miRNA-seq validation script # Purpose: Independently validate ER- and HER2-associated miRNAs from the # Taipei VGH Clariom D microarray cohort using TCGA-BRCA miRNA-seq. # # Tested with R >= 4.3 and TCGAbiolinks 2.x / 3.x. # Robust to subtype-table column-naming changes across versions. # Uses BCR Biotab clinical_patient_brca for ER/HER2 IHC status (most reliable # source in current TCGAbiolinks releases). # ============================================================================ # --------------------------------------------------------------------------- # 0. INSTALL DEPENDENCIES (runs once; safe to leave in for every run) # --------------------------------------------------------------------------- required_cran <- c( "BiocManager", "dplyr", "ggplot2", "ggrepel", "vroom", "readr", "tibble", "tidyr", "purrr", "jsonlite", "xml2", "httr", "curl", "data.table", "R.utils", "stringr", "knitr" ) for (pkg in required_cran) { if (!requireNamespace(pkg, quietly = TRUE)) { message("Installing CRAN package: ", pkg) install.packages(pkg, repos = "https://cloud.r-project.org", dependencies = TRUE) } } if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager", repos = "https://cloud.r-project.org") } required_bioc <- c("TCGAbiolinks", "SummarizedExperiment", "limma", "edgeR") for (pkg in required_bioc) { if (!requireNamespace(pkg, quietly = TRUE)) { message("Installing Bioconductor package: ", pkg) BiocManager::install(pkg, ask = FALSE, update = FALSE, dependencies = TRUE) } } # --------------------------------------------------------------------------- # 1. LOAD LIBRARIES # --------------------------------------------------------------------------- suppressPackageStartupMessages({ library(TCGAbiolinks) library(SummarizedExperiment) library(limma) library(edgeR) library(dplyr) library(ggplot2) library(ggrepel) }) # --------------------------------------------------------------------------- # 2. WORKING DIRECTORY # --------------------------------------------------------------------------- work_dir <- file.path(Sys.getenv("HOME"), "TCGA_validation") dir.create(work_dir, showWarnings = FALSE, recursive = TRUE) setwd(work_dir) message("Working directory: ", getwd()) # --------------------------------------------------------------------------- # 3. DOWNLOAD TCGA-BRCA miRNA-seq QUANTIFICATION # --------------------------------------------------------------------------- message("\n--- Step 3: Downloading TCGA-BRCA miRNA-seq quantification ---") query_mirna <- GDCquery( project = "TCGA-BRCA", data.category = "Transcriptome Profiling", data.type = "miRNA Expression Quantification", experimental.strategy = "miRNA-Seq", workflow.type = "BCGSC miRNA Profiling" ) GDCdownload(query_mirna, method = "api", files.per.chunk = 30) mirna <- GDCprepare(query_mirna) rpm_cols <- grep("^reads_per_million_miRNA_mapped_", colnames(mirna), value = TRUE) expr_mat <- as.matrix(mirna[, rpm_cols]) rownames(expr_mat) <- mirna$miRNA_ID colnames(expr_mat) <- sub("^reads_per_million_miRNA_mapped_", "", rpm_cols) message(sprintf("Expression matrix: %d miRNAs x %d aliquots", nrow(expr_mat), ncol(expr_mat))) # --------------------------------------------------------------------------- # 4. ER / HER2 ANNOTATION - tries 3 sources, in order # --------------------------------------------------------------------------- # Source A: TCGAquery_subtype("brca") (varies by version) # Source B: BCR Biotab clinical_patient_brca (has er_status_by_ihc etc.) # Source C: GDCquery_clinic("TCGA-BRCA") (rarely has IHC anymore) find_col <- function(df, patterns, label) { for (pat in patterns) { hit <- grep(pat, colnames(df), ignore.case = TRUE, value = TRUE) if (length(hit) > 0) { message(sprintf(" -> using column '%s' for %s", hit[1], label)) return(hit[1]) } } return(NA_character_) } normalize_status <- function(x) { ifelse(grepl("positive", x, ignore.case = TRUE), "Positive", ifelse(grepl("negative", x, ignore.case = TRUE), "Negative", NA)) } annot <- NULL # data.frame with patient, ER_status, HER2_status # ----------- Source A: subtype table ----------- message("\n--- Step 4A: Trying TCGAquery_subtype(brca) ---") subtype <- tryCatch(TCGAquery_subtype(tumor = "brca"), error = function(e) NULL) if (!is.null(subtype)) { message("Subtype columns: ", paste(colnames(subtype), collapse = ", ")) pat_col <- find_col(subtype, c("^patient$", "^submitter_id$", "^bcr_patient_barcode$"), "patient ID") er_col <- find_col(subtype, c("^ER[._ ]Status$", "ER.*Status.*IHC", "er_status_by_ihc", "^ER$"), "ER") her2_col <- find_col(subtype, c("^HER2[._ ]Final[._ ]Status$", "HER2.*Status.*IHC", "her2_status_by_ihc", "^HER2[._ ]Status$", "^HER2$"), "HER2") if (!is.na(pat_col) && !is.na(er_col) && !is.na(her2_col)) { annot <- data.frame( patient = subtype[[pat_col]], ER_status = normalize_status(subtype[[er_col]]), HER2_status = normalize_status(subtype[[her2_col]]), stringsAsFactors = FALSE ) |> filter(!is.na(ER_status), !is.na(HER2_status)) message(sprintf("Source A success: %d patients with ER and HER2 calls", nrow(annot))) } else { message("Source A: subtype table lacks IHC columns; trying next source.") } } # ----------- Source B: BCR Biotab clinical_patient_brca ----------- if (is.null(annot) || nrow(annot) < 20) { message("\n--- Step 4B: Trying BCR Biotab clinical_patient_brca ---") query_biotab <- tryCatch( GDCquery(project = "TCGA-BRCA", data.category = "Clinical", data.type = "Clinical Supplement", data.format = "BCR Biotab"), error = function(e) { message("GDCquery biotab failed: ", conditionMessage(e)); NULL } ) if (!is.null(query_biotab)) { GDCdownload(query_biotab, method = "api") biotab <- GDCprepare(query_biotab) # 'biotab' is a list of data.frames keyed by sheet name message("Biotab tables available: ", paste(names(biotab), collapse = ", ")) # Find the patient-level table (clinical_patient_brca) pat_tab_name <- grep("clinical_patient", names(biotab), value = TRUE) if (length(pat_tab_name) >= 1) { pat_tab <- biotab[[pat_tab_name[1]]] # Biotab often has 1-2 header rows of column descriptions; drop them if (any(grepl("^CDE_ID|bcr_patient_barcode$", pat_tab$bcr_patient_barcode))) { pat_tab <- pat_tab[!grepl("^CDE_ID|bcr_patient_barcode$", pat_tab$bcr_patient_barcode), ] } message("clinical_patient_brca columns containing er/her2:") relcols <- grep("er_status|estrogen|her2", colnames(pat_tab), value = TRUE, ignore.case = TRUE) print(relcols) pat_col <- "bcr_patient_barcode" er_col <- find_col(pat_tab, c("er_status_by_ihc", "breast_carcinoma_estrogen_receptor_status", "estrogen_receptor_status"), "ER (biotab)") her2_col <- find_col(pat_tab, c("her2_status_by_ihc", "lab_proc_her2_neu_immunohistochemistry_receptor_status", "her2_immunohistochemistry_level_result", "her2_neu_status"), "HER2 (biotab)") if (!is.na(er_col) && !is.na(her2_col)) { annot <- data.frame( patient = pat_tab[[pat_col]], ER_status = normalize_status(pat_tab[[er_col]]), HER2_status = normalize_status(pat_tab[[her2_col]]), stringsAsFactors = FALSE ) |> filter(!is.na(ER_status), !is.na(HER2_status)) message(sprintf("Source B success: %d patients with ER and HER2 calls", nrow(annot))) } else { message("Source B: biotab clinical_patient_brca lacks ER and/or HER2 IHC columns.") } } } } # ----------- Source C: GDCquery_clinic ----------- if (is.null(annot) || nrow(annot) < 20) { message("\n--- Step 4C: Trying GDCquery_clinic(TCGA-BRCA) ---") clin <- tryCatch(GDCquery_clinic("TCGA-BRCA", type = "clinical"), error = function(e) NULL) if (!is.null(clin)) { pat_col <- "submitter_id" er_col <- find_col(clin, c("breast_carcinoma_estrogen_receptor_status", "estrogen_receptor_status", "er_status_by_ihc"), "ER (clinical)") her2_col <- find_col(clin, c("lab_proc_her2_neu_immunohistochemistry_receptor_status", "her2_status_by_ihc", "her2_neu_status"), "HER2 (clinical)") if (!is.na(er_col) && !is.na(her2_col)) { annot <- data.frame( patient = clin[[pat_col]], ER_status = normalize_status(clin[[er_col]]), HER2_status = normalize_status(clin[[her2_col]]), stringsAsFactors = FALSE ) |> filter(!is.na(ER_status), !is.na(HER2_status)) message(sprintf("Source C success: %d patients with ER and HER2 calls", nrow(annot))) } } } if (is.null(annot) || nrow(annot) < 20) { stop("Could not locate ER and HER2 status from any of the three TCGA sources. ", "Please run `View(biotab$clinical_patient_brca)` and identify the IHC ", "column names manually, then add them to the er_col / her2_col patterns ", "in Source B of this script.") } # --------------------------------------------------------------------------- # 5. JOIN ANNOTATION TO THE EXPRESSION MATRIX AND FILTER # --------------------------------------------------------------------------- sample_meta <- data.frame( aliquot = colnames(expr_mat), patient = substr(colnames(expr_mat), 1, 12), stringsAsFactors = FALSE ) |> left_join(annot, by = "patient") |> filter(ER_status %in% c("Positive", "Negative"), HER2_status %in% c("Positive", "Negative")) expr_mat <- expr_mat[, sample_meta$aliquot] message(sprintf("\nSamples retained for analysis: %d (ER+: %d, ER-: %d, HER2+: %d, HER2-: %d)", nrow(sample_meta), sum(sample_meta$ER_status == "Positive"), sum(sample_meta$ER_status == "Negative"), sum(sample_meta$HER2_status == "Positive"), sum(sample_meta$HER2_status == "Negative"))) # --------------------------------------------------------------------------- # 6. DIFFERENTIAL EXPRESSION (limma-trend on log2(RPM+1)) # --------------------------------------------------------------------------- y <- log2(expr_mat + 1) design_ER <- model.matrix(~ ER_status + HER2_status, data = sample_meta) fit_ER <- lmFit(y, design_ER) fit_ER <- eBayes(fit_ER, trend = TRUE, robust = TRUE) res_ER <- topTable(fit_ER, coef = "ER_statusPositive", number = Inf, sort.by = "P", adjust.method = "BH") res_ER$miRNA <- rownames(res_ER) design_H2 <- model.matrix(~ HER2_status + ER_status, data = sample_meta) fit_H2 <- lmFit(y, design_H2) fit_H2 <- eBayes(fit_H2, trend = TRUE, robust = TRUE) res_H2 <- topTable(fit_H2, coef = "HER2_statusPositive", number = Inf, sort.by = "P", adjust.method = "BH") res_H2$miRNA <- rownames(res_H2) # --------------------------------------------------------------------------- # 7. COMPARE TO DISCOVERY HITS # --------------------------------------------------------------------------- discovery_ER <- c("hsa-mir-7-2","hsa-mir-4468","hsa-mir-6812","hsa-mir-548i-2", "hsa-mir-1278","hsa-mir-548ba","hsa-mir-8086","hsa-mir-378d-2", "hsa-mir-190a","hsa-mir-92b","hsa-mir-1273f") discovery_H2 <- c("hsa-mir-4728","hsa-mir-7641-2","hsa-mir-4311","hsa-mir-6762", "hsa-mir-3668","hsa-mir-1343","hsa-mir-384","hsa-mir-4535", "hsa-mir-146a","hsa-mir-6825","hsa-mir-5701-1","hsa-mir-5701-2", "hsa-mir-5701-3","hsa-mir-496","hsa-mir-550b-1","hsa-mir-4520-1") val_ER <- res_ER |> filter(miRNA %in% discovery_ER) |> select(miRNA, logFC, P.Value, adj.P.Val) val_H2 <- res_H2 |> filter(miRNA %in% discovery_H2) |> select(miRNA, logFC, P.Value, adj.P.Val) write.csv(val_ER, "tcga_validation_ER.csv", row.names = FALSE) write.csv(val_H2, "tcga_validation_HER2.csv", row.names = FALSE) write.csv(res_ER, "tcga_full_results_ER.csv", row.names = FALSE) write.csv(res_H2, "tcga_full_results_HER2.csv", row.names = FALSE) # --------------------------------------------------------------------------- # 8. VOLCANO PLOTS # --------------------------------------------------------------------------- make_volcano <- function(res, hits, title, outfile) { res$signif <- res$adj.P.Val < 0.05 res$label <- ifelse(res$miRNA %in% hits, res$miRNA, NA) p <- ggplot(res, aes(x = logFC, y = -log10(P.Value))) + geom_point(aes(color = signif), alpha = 0.4, size = 0.6) + geom_point(data = subset(res, miRNA %in% hits), color = "red", size = 1.8) + ggrepel::geom_text_repel(aes(label = label), size = 3, max.overlaps = 30) + scale_color_manual(values = c("grey70", "steelblue"), labels = c("q >= 0.05", "q < 0.05")) + theme_classic(base_size = 11) + labs(title = title, color = "TCGA-BRCA significance", x = "log2 fold change", y = "-log10(P)") ggsave(outfile, p, width = 7, height = 5) } make_volcano(res_ER, discovery_ER, "TCGA-BRCA: ER+ vs ER- (limma-trend)", "tcga_brca_validation_volcano_ER.pdf") make_volcano(res_H2, discovery_H2, "TCGA-BRCA: HER2+ vs HER2- (limma-trend)", "tcga_brca_validation_volcano_HER2.pdf") cat("\nValidation complete. Outputs in:", getwd(), "\n") cat("Files written:\n") cat(" tcga_validation_ER.csv - discovery ER candidates with TCGA stats\n") cat(" tcga_validation_HER2.csv - discovery HER2 candidates with TCGA stats\n") cat(" tcga_full_results_ER.csv - full TCGA ER contrast\n") cat(" tcga_full_results_HER2.csv - full TCGA HER2 contrast\n") cat(" tcga_brca_validation_volcano_ER.pdf\n") cat(" tcga_brca_validation_volcano_HER2.pdf\n")