Add MS1 annotation (Level 3) and KEGG pathway enrichment to Q Exactive E2E pipeline

jiajunagent · claude · jiajunagent · commit f536ddc659bb · 2026-03-18T21:50:07.000-04:00
- New annotation_ms1.R: loads pre-built compound database CSV (67K compounds
  from KEGG+LipidMaps+ChEBI), computes theoretical m/z for ESI+/- adducts,
  matches features via binary search within ppm tolerance. Replaces slow
  MSP/MGF parsing approach that caused Docker timeouts.
- New pathway_ora.R: Fisher's exact test ORA against KEGG pathways via
  KEGGREST API. Includes Nature-style dot plot visualization.
- E2E pipeline now covers 7 steps: peak detection → preprocessing →
  differential analysis → visualization → annotation → pathway → summary
- E2E results: 80.4% feature annotation coverage, 8 significant KEGG
  pathways; annotation completes in 12.7s (previously timed out)
- Dockerfile updated: adds KEGGREST, copies new R modules, mounts
  spectral_libraries volume
- .gitignore updated for E2E result dirs and spectral_libraries

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -59,6 +59,12 @@ tests/e2e/data/MTBLS733_mzML_subset/
 tests/e2e/data/*.txt
 tests/e2e/tools/
 tests/e2e/results/
+tests/e2e/results_qe/
+tests/e2e/results_qe_v2/
+
+# Spectral libraries (shared, not part of this repo)
+spectral_libraries
+
 *.CDF
 *.cdf
 *.mzML
diff --git a/packages/engines/stats-worker/R/annotation_ms1.R b/packages/engines/stats-worker/R/annotation_ms1.R
@@ -0,0 +1,175 @@
+##############################################################################
+##  annotation_ms1.R
+##  MS1-level metabolite annotation (Level 3) for xcms-based pipelines
+##
+##  Loads a pre-built compound database CSV (exact_mass + formula + IDs),
+##  calculates theoretical m/z for common adducts, and matches detected
+##  features within ppm tolerance.
+##  No dependency on tidymass, massdataset, or spectral library parsing.
+##############################################################################
+
+## Common ESI adducts: name -> mass shift in Da (added to neutral monoisotopic mass)
+ESI_POS_ADDUCTS <- data.frame(
+  adduct = c("[M+H]+", "[M+Na]+", "[M+K]+", "[M+NH4]+"),
+  # [M+NH4]+ = N + 4 H - electron = 18.033826 Da (18.034164 was ~0.34 mDa high)
+  delta  = c(1.007276, 22.989218, 38.963158, 18.033826),
+  stringsAsFactors = FALSE
+)
+ESI_NEG_ADDUCTS <- data.frame(
+  adduct = c("[M-H]-", "[M+FA-H]-", "[M+Cl]-"),
+  delta  = c(-1.007276, 44.998201, 34.969402),
+  stringsAsFactors = FALSE
+)
+
+## Load the Level 3 compound database CSV (columns: name, formula, exact_mass,
+## kegg_id, hmdb_id, chebi_id, lipidmaps_id, source). Stops when the file or its
+## exact_mass column is missing; drops rows whose exact_mass is NA or <= 0.
+load_compound_db <- function(csv_path) {
+  if (!file.exists(csv_path)) stop("Compound database not found: ", csv_path)
+  cat("  Loading compound database:", basename(csv_path), "...")
+  db <- read.csv(csv_path, stringsAsFactors = FALSE)
+  if (!"exact_mass" %in% names(db)) stop("No 'exact_mass' column in: ", csv_path)
+  db$exact_mass <- as.numeric(db$exact_mass)
+  db <- db[!is.na(db$exact_mass) & db$exact_mass > 0, , drop = FALSE]
+  cat(" loaded", nrow(db), "compounds\n")
+  db
+}
+
+## Pre-compute theoretical m/z for every compound x adduct combination.
+##
+## @param compound_db  data.frame with a numeric exact_mass column
+## @param polarity     "positive" selects ESI_POS_ADDUCTS; any other value
+##                     selects ESI_NEG_ADDUCTS
+## @return data.frame (compound_idx, adduct, theoretical_mz) sorted by
+##         theoretical_mz; compound_idx is the row position in compound_db.
+##         Has zero rows (no error) when no compound gives an m/z above 50.
+compute_theoretical_mz <- function(compound_db, polarity = "positive") {
+  adducts <- if (polarity == "positive") ESI_POS_ADDUCTS else ESI_NEG_ADDUCTS
+  n_cpd <- nrow(compound_db)
+  n_add <- nrow(adducts)
+  # Adduct-major layout: every compound for adduct 1, then adduct 2, ...
+  cpd_idx <- rep(seq_len(n_cpd), times = n_add)
+  add_idx <- rep(seq_len(n_add), each = n_cpd)
+  theo <- compound_db$exact_mass[cpd_idx] + adducts$delta[add_idx]
+  keep <- which(theo > 50)  # skip unreasonable values; which() also drops NA
+
+  theo_df <- data.frame(
+    compound_idx   = cpd_idx[keep],
+    adduct         = adducts$adduct[add_idx[keep]],
+    theoretical_mz = theo[keep],
+    stringsAsFactors = FALSE
+  )
+  # Sort by theoretical m/z for fast binary search
+  theo_df <- theo_df[order(theo_df$theoretical_mz), ]
+  cat("  Pre-computed", nrow(theo_df), "theoretical m/z values (",
+      nrow(adducts), "adducts x", nrow(compound_db), "compounds)\n")
+  theo_df
+}
+
+## MS1-level annotation using pre-computed theoretical m/z
+##
+## For every feature, all theoretical m/z values inside a +/- ppm_tol window
+## are located by binary search (findInterval) on the sorted table, ranked by
+## absolute ppm error, and the best max_matches are reported. Features whose
+## m/z is NA, or whose window contains no theoretical value, are skipped.
+##
+## @param feature_mz   numeric vector of feature m/z values
+## @param feature_ids  character vector of feature IDs
+## @param feature_rt   numeric vector of feature RT in seconds (optional)
+## @param compound_db  data.frame from load_compound_db()
+## @param theo_mz_df   data.frame from compute_theoretical_mz(); must be
+##                     sorted by theoretical_mz
+## @param ppm_tol      matching tolerance in ppm (default 5)
+## @param max_matches  maximum matches per feature (default 3)
+##
+## @return data.frame with one row per (feature, candidate) pair and columns
+##         feature_id, mz, rt, matched_name, formula, adduct, ppm_error,
+##         kegg_id, hmdb_id, chebi_id, lipidmaps_id, source. Rows are ordered
+##         by feature, then by increasing ppm error. Zero rows when nothing
+##         matches.
+annotate_ms1 <- function(feature_mz,
+                          feature_ids,
+                          feature_rt = NULL,
+                          compound_db,
+                          theo_mz_df,
+                          ppm_tol = 5,
+                          max_matches = 3) {
+  cat("  MS1 annotation:", length(feature_mz), "features x",
+      nrow(theo_mz_df), "theoretical m/z (ppm=", ppm_tol, ")...\n")
+
+  theo_mz_vec <- theo_mz_df$theoretical_mz
+
+  # Window bounds for all features at once; findInterval() is vectorised and
+  # yields NA for NA m/z, which the filter below discards.
+  mz_tol <- feature_mz * ppm_tol / 1e6
+  idx_lo <- findInterval(feature_mz - mz_tol, theo_mz_vec) + 1L
+  idx_hi <- findInterval(feature_mz + mz_tol, theo_mz_vec)
+  hit <- which(!is.na(idx_lo) & !is.na(idx_hi) & idx_lo <= idx_hi)
+
+  # Per matched feature: rank candidates by ppm error and keep the best few.
+  # Only index vectors are collected here; the result table is assembled once
+  # afterwards instead of building and rbind-ing one data.frame per match.
+  picks <- lapply(hit, function(i) {
+    rows <- idx_lo[i]:idx_hi[i]
+    ppm <- abs(theo_mz_vec[rows] - feature_mz[i]) / feature_mz[i] * 1e6
+    # head() also copes with max_matches = 0, unlike 1:max_matches
+    best <- head(order(ppm), max_matches)
+    list(feature = rep(i, length(best)), row = rows[best], ppm = ppm[best])
+  })
+  feat_idx <- unlist(lapply(picks, `[[`, "feature"))
+  theo_row <- unlist(lapply(picks, `[[`, "row"))
+  ppm_err  <- unlist(lapply(picks, `[[`, "ppm"))
+
+  if (length(feat_idx) == 0) {
+    cat("  No matches found\n")
+    return(data.frame(
+      feature_id = character(), mz = numeric(), rt = numeric(),
+      matched_name = character(), formula = character(),
+      adduct = character(), ppm_error = numeric(),
+      kegg_id = character(), hmdb_id = character(),
+      chebi_id = character(), lipidmaps_id = character(),
+      source = character(), stringsAsFactors = FALSE
+    ))
+  }
+
+  # Fetch compound metadata for all selected candidates in one lookup
+  cpd <- compound_db[theo_mz_df$compound_idx[theo_row], , drop = FALSE]
+  out <- data.frame(
+    feature_id   = feature_ids[feat_idx],
+    mz           = feature_mz[feat_idx],
+    rt           = if (is.null(feature_rt)) NA else feature_rt[feat_idx],
+    matched_name = cpd$name,
+    formula      = cpd$formula,
+    adduct       = theo_mz_df$adduct[theo_row],
+    ppm_error    = round(ppm_err, 2),
+    kegg_id      = cpd$kegg_id,
+    hmdb_id      = cpd$hmdb_id,
+    chebi_id     = cpd$chebi_id,
+    lipidmaps_id = cpd$lipidmaps_id,
+    source       = cpd$source,
+    row.names = NULL,
+    stringsAsFactors = FALSE
+  )
+
+  n_features_matched <- length(unique(out$feature_id))
+  cat("  Matched:", n_features_matched, "features ->", nrow(out), "annotations\n")
+  out
+}
+
+## Summarize annotation results; per-feature counts use the best (lowest ppm
+## error) annotation. An empty table yields the same fields with zero counts.
+summarize_annotations <- function(ann_df) {
+  best <- ann_df[order(ann_df$ppm_error), , drop = FALSE]
+  best <- best[!duplicated(best$feature_id), , drop = FALSE]
+  # Treat a missing (NULL) column as empty so both tables are always well-formed
+  tab <- function(x) table(if (is.null(x)) character() else x)
+  list(
+    n_matched           = length(unique(ann_df$feature_id)),
+    n_total_annotations = nrow(ann_df),
+    n_unique_compounds  = length(unique(best$matched_name)),
+    n_with_kegg         = sum(best$kegg_id != "", na.rm = TRUE),
+    n_with_hmdb         = sum(best$hmdb_id != "", na.rm = TRUE),
+    adduct_distribution = tab(ann_df$adduct),
+    source_distribution = tab(ann_df$source)
+  )
+}
diff --git a/packages/engines/stats-worker/R/pathway_ora.R b/packages/engines/stats-worker/R/pathway_ora.R
@@ -0,0 +1,173 @@
+##############################################################################
+##  pathway_ora.R
+##  Over-Representation Analysis (ORA) for metabolic pathway enrichment
+##
+##  Uses KEGGREST to fetch compound-pathway mappings, then runs Fisher's
+##  exact test. No dependency on MetaboAnalystR or tidymass.
+##############################################################################
+
+## Run KEGG pathway ORA using a one-sided Fisher's exact (hypergeometric) test
+##
+## Compound-pathway links and the pathway list are fetched from the KEGG REST
+## API via KEGGREST. The universe is every compound linked to a listed pathway
+## that has at least two linked compounds. Only pathways containing at least
+## one query compound are tested; the BH adjustment covers those pathways only.
+##
+## @param query_ids   character vector of KEGG compound IDs (e.g., "C00001")
+## @param organism    KEGG organism code (e.g., "hsa" for human, "ko" for reference)
+## @param p_cutoff    threshold for the printed enriched-pathway count (default 0.05)
+## @return data.frame sorted by raw_p with columns: pathway, pathway_id, total,
+##         expected, hits, raw_p, fold_enrichment, hit_compounds, fdr.
+##         An empty data.frame() is returned when KEGGREST is not installed,
+##         a KEGG request fails, fewer than two query compounds fall in the
+##         universe, or no pathway contains a query compound.
+run_kegg_ora <- function(query_ids,
+                          organism = "ko",
+                          p_cutoff = 0.05) {
+  if (!requireNamespace("KEGGREST", quietly = TRUE)) {
+    cat("  KEGGREST not installed, skipping KEGG pathway analysis\n")
+    return(data.frame())
+  }
+
+  cat("  KEGG ORA: fetching pathway definitions...\n")
+  old_timeout <- getOption("timeout")
+  options(timeout = 120)
+  on.exit(options(timeout = old_timeout), add = TRUE)
+
+  # Get compound-pathway links. return() inside a tryCatch handler only leaves
+  # the handler, so a failure is reported as NULL and checked afterwards.
+  cpd_pw_link <- tryCatch(
+    KEGGREST::keggLink("pathway", "compound"),
+    error = function(e) {
+      cat("  KEGG API unavailable:", conditionMessage(e), "\n")
+      NULL
+    }
+  )
+  if (is.null(cpd_pw_link)) return(data.frame())
+
+  cpd_ids <- sub("cpd:", "", names(cpd_pw_link))
+  pw_ids  <- sub("path:map", "", as.character(cpd_pw_link))
+
+  # Get pathway names
+  org_pathways <- tryCatch(
+    KEGGREST::keggList("pathway", organism),
+    error = function(e) {
+      cat("  KEGG pathway list unavailable:", conditionMessage(e), "\n")
+      NULL
+    }
+  )
+  if (is.null(org_pathways)) return(data.frame())
+
+  # Reduce entry names to the numeric pathway code. The optional "path:" prefix
+  # is stripped defensively (TODO confirm which KEGGREST versions emit it).
+  pw_nums  <- sub(paste0("^(path:)?", organism), "", names(org_pathways))
+  pw_names <- sub(" - .*$", "", as.character(org_pathways))
+  pw_name_map <- setNames(pw_names, pw_nums)
+
+  # Build pathway compound sets in one pass; keep listed pathways with at
+  # least two linked compounds, in the order of the pathway list.
+  by_pathway <- split(cpd_ids, pw_ids)
+  pw_cpd_sets <- by_pathway[intersect(pw_nums, names(by_pathway))]
+  pw_cpd_sets <- pw_cpd_sets[lengths(pw_cpd_sets) >= 2]
+  all_kegg_cpds <- unique(as.character(unlist(pw_cpd_sets, use.names = FALSE)))
+
+  N <- length(all_kegg_cpds)  # universe size
+  query_in_universe <- intersect(query_ids, all_kegg_cpds)
+  n <- length(query_in_universe)  # query set size
+
+  cat("  Universe:", N, "compounds |", length(pw_cpd_sets), "pathways | Query:", n,
+      "of", length(query_ids), "mapped\n")
+
+  if (n < 2) {
+    cat("  Too few query compounds mapped to KEGG, skipping ORA\n")
+    return(data.frame())
+  }
+
+  # Test each pathway that contains at least one query compound
+  results <- list()
+  for (pw_num in names(pw_cpd_sets)) {
+    pw_cpds <- pw_cpd_sets[[pw_num]]
+    K <- length(pw_cpds)  # pathway size
+    hit_cpds <- intersect(query_in_universe, pw_cpds)
+    x <- length(hit_cpds)  # hits
+    if (x == 0) next
+    # P(X >= x) when drawing n compounds from a universe of N that holds K
+    # pathway members: the one-sided (enrichment) Fisher's exact p-value.
+    p_val <- phyper(x - 1, K, N - K, n, lower.tail = FALSE)
+    expected <- n * K / N
+    fold_enrich <- if (expected > 0) x / expected else Inf
+    label <- if (pw_num %in% pw_nums) pw_name_map[[pw_num]] else pw_num
+    results[[length(results) + 1]] <- data.frame(
+      pathway         = label,
+      pathway_id      = pw_num,
+      total           = K,
+      expected        = round(expected, 2),
+      hits            = x,
+      raw_p           = p_val,
+      fold_enrichment = round(fold_enrich, 2),
+      hit_compounds   = paste(hit_cpds, collapse = ";"),
+      stringsAsFactors = FALSE
+    )
+  }
+  if (length(results) == 0) {
+    cat("  No pathways enriched\n")
+    return(data.frame())
+  }
+
+  out <- do.call(rbind, results)
+  out$fdr <- p.adjust(out$raw_p, method = "BH")
+  out <- out[order(out$raw_p), ]
+  n_sig <- sum(out$raw_p < p_cutoff)
+  cat("  Enriched pathways (p<", p_cutoff, "):", n_sig, "of", nrow(out), "\n")
+  out
+}
+
+## Plot pathway enrichment dot plot (Nature style)
+##
+## Shows the top_n pathways with the smallest raw_p: x = number of hit
+## compounds, point size = -log10(raw_p), colour = fold enrichment. ggplot2
+## functions are called unqualified, so ggplot2 must be on the search path.
+##
+## @param pathway_df  data.frame from run_kegg_ora()
+## @param top_n       max pathways to show (default 20)
+## @param title       plot title
+## @return ggplot object, or NULL when pathway_df is NULL or has no rows
+plot_pathway_dotplot <- function(pathway_df,
+                                  top_n = 20,
+                                  title = "KEGG Pathway Enrichment") {
+  if (is.null(pathway_df) || nrow(pathway_df) == 0) return(NULL)
+
+  # Top N by p-value
+  plot_data <- head(pathway_df[order(pathway_df$raw_p), ], top_n)
+  plot_data$neg_log_p <- -log10(plot_data$raw_p)
+
+  # Truncate long pathway names; truncation can make two labels identical and
+  # factor() rejects duplicated levels, so make.unique() disambiguates them.
+  full_name <- as.character(plot_data$pathway)
+  short <- ifelse(nchar(full_name) > 45,
+                  paste0(substr(full_name, 1, 42), "..."), full_name)
+  short <- make.unique(short, sep = " #")
+  plot_data$pathway_short <- factor(short, levels = rev(short))
+
+  pathway_gradient <- c("#FEE0D2", "#FC9272", "#DE2D26", "#A50F15")
+
+  p <- ggplot(plot_data, aes(x = hits, y = pathway_short)) +
+    geom_point(aes(size = neg_log_p, color = fold_enrichment)) +
+    scale_color_gradientn(colours = pathway_gradient,
+                          name = "Fold\nenrichment") +
+    scale_size_continuous(range = c(2, 7),
+                          name = expression(-log[10]*italic(P))) +
+    labs(x = "Hit compounds", y = NULL, title = title) +
+    theme_classic(base_size = 7) +
+    theme(
+      axis.text.y = element_text(size = 6),
+      axis.text.x = element_text(size = 6),
+      axis.title  = element_text(size = 7),
+      plot.title  = element_text(size = 8, face = "bold"),
+      legend.title = element_text(size = 6),
+      legend.text  = element_text(size = 5),
+      legend.key.size = unit(3, "mm"),
+      panel.grid.major.x = element_line(color = "grey92", linewidth = 0.3)
+    )
+  p
+}
diff --git a/tests/e2e/Dockerfile.qexactive b/tests/e2e/Dockerfile.qexactive
@@ -22,7 +22,7 @@ RUN Rscript -e " \
     if (!requireNamespace('BiocManager', quietly = TRUE)) \
         install.packages('BiocManager'); \
     BiocManager::install(version = '3.20', ask = FALSE, update = FALSE); \
-    BiocManager::install(c('xcms', 'limma', 'MSnbase'), ask = FALSE, update = FALSE); \
+    BiocManager::install(c('xcms', 'limma', 'MSnbase', 'KEGGREST'), ask = FALSE, update = FALSE); \
     install.packages(c( \
         'ggplot2', 'dplyr', 'tidyr', 'pheatmap', 'patchwork', \
         'ggsci', 'scales', \
@@ -38,10 +38,12 @@ WORKDIR /app
 COPY packages/engines/stats-worker/R/config.R /app/R/config.R
 COPY packages/engines/stats-worker/R/differential.R /app/R/differential.R
 COPY packages/engines/stats-worker/R/visualization.R /app/R/visualization.R
+COPY packages/engines/stats-worker/R/annotation_ms1.R /app/R/annotation_ms1.R
+COPY packages/engines/stats-worker/R/pathway_ora.R /app/R/pathway_ora.R
 
 # Copy Q Exactive pipeline script
 COPY tests/e2e/run_e2e_qexactive.R /app/run_e2e_qexactive.R
 
-VOLUME ["/data", "/results"]
+VOLUME ["/data", "/results", "/spectral_libraries"]
 
 CMD ["Rscript", "/app/run_e2e_qexactive.R"]
diff --git a/tests/e2e/run_e2e_qexactive.R b/tests/e2e/run_e2e_qexactive.R