Skip to content

Commit f536ddc

Browse files
jiajunagentclaude
andcommitted
Add MS1 annotation (Level 3) and KEGG pathway enrichment to Q Exactive E2E pipeline
- New annotation_ms1.R: loads pre-built compound database CSV (67K compounds from KEGG+LipidMaps+ChEBI), computes theoretical m/z for ESI+/- adducts, matches features via binary search within ppm tolerance. Replaces slow MSP/MGF parsing approach that caused Docker timeouts.
- New pathway_ora.R: Fisher's exact test ORA against KEGG pathways via KEGGREST API. Includes Nature-style dot plot visualization.
- E2E pipeline now covers 7 steps: peak detection → preprocessing → differential analysis → visualization → annotation → pathway → summary
- E2E results: 80.4% feature annotation coverage, 8 significant KEGG pathways, annotation completes in 12.7s (was timeout before)
- Dockerfile updated: adds KEGGREST, copies new R modules, mounts spectral_libraries volume
- .gitignore updated for E2E result dirs and spectral_libraries

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5b6c9f0 commit f536ddc

5 files changed

Lines changed: 495 additions & 18 deletions

File tree

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ tests/e2e/data/MTBLS733_mzML_subset/
5959
tests/e2e/data/*.txt
6060
tests/e2e/tools/
6161
tests/e2e/results/
62+
tests/e2e/results_qe/
63+
tests/e2e/results_qe_v2/
64+
65+
# Spectral libraries (shared, not part of this repo)
66+
spectral_libraries
67+
6268
*.CDF
6369
*.cdf
6470
*.mzML
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
##############################################################################
2+
## annotation_ms1.R
3+
## MS1-level metabolite annotation (Level 3) for xcms-based pipelines
4+
##
5+
## Loads a pre-built compound database CSV (exact_mass + formula + IDs),
6+
## calculates theoretical m/z for common adducts, and matches detected
7+
## features within ppm tolerance.
8+
## No dependency on tidymass, massdataset, or spectral library parsing.
9+
##############################################################################
10+
11+
## Common positive-mode ESI adducts: name -> m/z shift that is added to the
## neutral monoisotopic mass (singly charged, electron mass 0.000549 removed).
##   [M+H]+    H   1.007825 - e-                    =  1.007276
##   [M+Na]+   Na 22.989770 - e-                    = 22.989218
##   [M+K]+    K  38.963707 - e-                    = 38.963158
##   [M+NH4]+  N  14.003074 + 4 x H 1.007825 - e-   = 18.033823
## The ammonium shift was previously 18.034164, which is ~0.34 mDa too high
## and costs 1-3 ppm of the matching window for typical metabolite masses.
ESI_POS_ADDUCTS <- data.frame(
  adduct = c("[M+H]+", "[M+Na]+", "[M+K]+", "[M+NH4]+"),
  delta = c(1.007276, 22.989218, 38.963158, 18.033823),
  stringsAsFactors = FALSE
)
17+
18+
## Negative-mode ESI adducts, same layout as the positive-mode table:
## `adduct` is the label, `delta` the m/z offset applied to the neutral
## monoisotopic mass (deprotonation, formate adduct, chloride adduct).
ESI_NEG_ADDUCTS <- local({
  shifts <- c(
    "[M-H]-" = -1.007276,
    "[M+FA-H]-" = 44.998201,
    "[M+Cl]-" = 34.969402
  )
  data.frame(
    adduct = names(shifts),
    delta = unname(shifts),
    stringsAsFactors = FALSE
  )
})
23+
24+
## Load Level 3 compound database from CSV
## Expected columns: name, formula, exact_mass, kegg_id, hmdb_id, chebi_id, lipidmaps_id, source
##
## Only `exact_mass` is required here; rows whose mass is missing, non-numeric,
## non-finite or not strictly positive are dropped.
##
## @param csv_path path to the compound database CSV
##
## @return data.frame of the retained compounds, with `exact_mass` numeric
load_compound_db <- function(csv_path) {
  if (!file.exists(csv_path)) {
    stop("Compound database not found: ", csv_path)
  }
  cat(" Loading compound database:", basename(csv_path), "...")
  db <- read.csv(csv_path, stringsAsFactors = FALSE)

  # Fail with a readable message instead of the opaque
  # "replacement has 0 rows" error that `db$exact_mass <- numeric(0)` raises.
  if (!"exact_mass" %in% names(db)) {
    stop("Compound database has no 'exact_mass' column: ", csv_path)
  }

  db$exact_mass <- as.numeric(db$exact_mass)
  usable <- is.finite(db$exact_mass) & db$exact_mass > 0
  # drop = FALSE keeps a data.frame even when the CSV has a single column
  db <- db[usable, , drop = FALSE]
  cat(" loaded", nrow(db), "compounds\n")
  db
}
37+
38+
## Pre-compute theoretical m/z for all compounds x adducts
##
## @param compound_db data.frame with a numeric `exact_mass` column
## @param polarity    "positive" selects ESI_POS_ADDUCTS; any other value
##                    selects ESI_NEG_ADDUCTS
##
## @return data.frame with: compound_idx (row position in compound_db),
##         adduct, theoretical_mz; sorted by ascending theoretical_mz.
##         Zero rows (same columns) when nothing passes the m/z > 50 filter.
compute_theoretical_mz <- function(compound_db, polarity = "positive") {
  adducts <- if (polarity == "positive") ESI_POS_ADDUCTS else ESI_NEG_ADDUCTS
  masses <- compound_db$exact_mass

  per_adduct <- lapply(seq_len(nrow(adducts)), function(a) {
    theo_mz <- masses + adducts$delta[a]
    idx <- which(theo_mz > 50) # skip unreasonable values (NA is dropped too)
    if (length(idx) == 0) {
      return(NULL)
    }
    data.frame(
      compound_idx = idx,
      adduct = adducts$adduct[a],
      theoretical_mz = theo_mz[idx],
      stringsAsFactors = FALSE
    )
  })
  per_adduct <- Filter(Negate(is.null), per_adduct)

  if (length(per_adduct) == 0) {
    # do.call(rbind, list()) is NULL, which cannot be indexed like a
    # data.frame; return an explicitly typed empty table instead.
    theo_df <- data.frame(
      compound_idx = integer(),
      adduct = character(),
      theoretical_mz = numeric(),
      stringsAsFactors = FALSE
    )
  } else {
    theo_df <- do.call(rbind, per_adduct)
    # Sort by theoretical m/z for fast binary search
    theo_df <- theo_df[order(theo_df$theoretical_mz), ]
  }

  cat(" Pre-computed", nrow(theo_df), "theoretical m/z values (",
      nrow(adducts), "adducts x", nrow(compound_db), "compounds)\n")
  theo_df
}
68+
69+
## MS1-level annotation using pre-computed theoretical m/z
##
## For every feature, the window [mz - tol, mz + tol] (tol = mz * ppm / 1e6,
## both bounds inclusive) is located in the sorted theoretical m/z table with
## findInterval(), and the closest `max_matches` candidates are reported.
## Features with a missing or non-finite m/z are skipped.
##
## @param feature_mz   numeric vector of feature m/z values
## @param feature_ids  character vector of feature IDs (same length as feature_mz)
## @param feature_rt   numeric vector of feature RT in seconds (optional)
## @param compound_db  data.frame from load_compound_db()
## @param theo_mz_df   data.frame from compute_theoretical_mz(); must be sorted
##                     by ascending theoretical_mz
## @param ppm_tol      matching tolerance in ppm (default 5)
## @param max_matches  maximum matches per feature (default 3)
##
## @return data.frame with one row per (feature, candidate) pair and columns
##         feature_id, mz, rt, matched_name, formula, adduct, ppm_error,
##         kegg_id, hmdb_id, chebi_id, lipidmaps_id, source; zero rows when
##         nothing matches
annotate_ms1 <- function(feature_mz,
                         feature_ids,
                         feature_rt = NULL,
                         compound_db,
                         theo_mz_df,
                         ppm_tol = 5,
                         max_matches = 3) {
  if (length(feature_ids) != length(feature_mz)) {
    stop("feature_ids and feature_mz must have the same length")
  }
  if (!is.null(feature_rt) && length(feature_rt) != length(feature_mz)) {
    stop("feature_rt and feature_mz must have the same length")
  }

  cat(" MS1 annotation:", length(feature_mz), "features x",
      nrow(theo_mz_df), "theoretical m/z (ppm=", ppm_tol, ")...\n")

  theo_mz_vec <- theo_mz_df$theoretical_mz
  n_theo <- length(theo_mz_vec)
  per_feature <- vector("list", length(feature_mz))

  for (i in seq_along(feature_mz)) {
    obs_mz <- feature_mz[i]
    # NA / Inf would make the comparisons below NA and abort the whole run
    if (!is.finite(obs_mz)) next

    # Binary search for m/z window
    mz_tol <- obs_mz * ppm_tol / 1e6
    # left.open = TRUE counts values strictly below the lower bound, so a
    # theoretical m/z exactly on the bound is kept (matches the upper bound).
    idx_lo <- findInterval(obs_mz - mz_tol, theo_mz_vec, left.open = TRUE) + 1L
    idx_hi <- findInterval(obs_mz + mz_tol, theo_mz_vec)

    if (idx_lo > idx_hi || idx_lo > n_theo) next

    window <- idx_lo:idx_hi
    ppm_err <- abs(theo_mz_vec[window] - obs_mz) / obs_mz * 1e6

    # Sort by ppm error and take top matches
    keep <- head(order(ppm_err), max_matches)
    if (length(keep) == 0) next

    hit_rows <- window[keep]
    cidx <- theo_mz_df$compound_idx[hit_rows]

    # One data.frame per feature (scalars recycle over its candidates)
    per_feature[[i]] <- data.frame(
      feature_id = feature_ids[i],
      mz = obs_mz,
      rt = if (!is.null(feature_rt)) feature_rt[i] else NA_real_,
      matched_name = compound_db$name[cidx],
      formula = compound_db$formula[cidx],
      adduct = theo_mz_df$adduct[hit_rows],
      ppm_error = round(ppm_err[keep], 2),
      kegg_id = compound_db$kegg_id[cidx],
      hmdb_id = compound_db$hmdb_id[cidx],
      chebi_id = compound_db$chebi_id[cidx],
      lipidmaps_id = compound_db$lipidmaps_id[cidx],
      source = compound_db$source[cidx],
      stringsAsFactors = FALSE
    )
  }

  per_feature <- Filter(Negate(is.null), per_feature)

  if (length(per_feature) == 0) {
    cat(" No matches found\n")
    return(data.frame(
      feature_id = character(), mz = numeric(), rt = numeric(),
      matched_name = character(), formula = character(),
      adduct = character(), ppm_error = numeric(),
      kegg_id = character(), hmdb_id = character(),
      chebi_id = character(), lipidmaps_id = character(),
      source = character(), stringsAsFactors = FALSE
    ))
  }

  out <- do.call(rbind, per_feature)
  n_features_matched <- length(unique(out$feature_id))
  cat(" Matched:", n_features_matched, "features ->", nrow(out), "annotations\n")
  out
}
158+
159+
## Summarize annotation results
##
## Per-feature statistics (unique compounds, KEGG/HMDB coverage) are computed
## on the best hit of each feature, i.e. the row with the lowest ppm_error.
##
## @param ann_df data.frame from annotate_ms1()
##
## @return list with n_matched, n_total_annotations, n_unique_compounds,
##         n_with_kegg, n_with_hmdb, adduct_distribution, source_distribution.
##         The same fields are returned (as zeros / empty tables) for an empty
##         input so callers never have to special-case missing entries.
summarize_annotations <- function(ann_df) {
  if (nrow(ann_df) == 0) {
    return(list(
      n_matched = 0,
      n_total_annotations = 0,
      n_unique_compounds = 0,
      n_with_kegg = 0,
      n_with_hmdb = 0,
      adduct_distribution = table(character()),
      source_distribution = table(character())
    ))
  }

  # Lowest ppm error first, then keep the first row seen for each feature
  best <- ann_df[order(ann_df$ppm_error), ]
  best <- best[!duplicated(best$feature_id), ]

  list(
    n_matched = length(unique(ann_df$feature_id)),
    n_total_annotations = nrow(ann_df),
    n_unique_compounds = length(unique(best$matched_name)),
    n_with_kegg = sum(best$kegg_id != "", na.rm = TRUE),
    n_with_hmdb = sum(best$hmdb_id != "", na.rm = TRUE),
    adduct_distribution = table(ann_df$adduct),
    source_distribution = table(ann_df$source)
  )
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
##############################################################################
2+
## pathway_ora.R
3+
## Over-Representation Analysis (ORA) for metabolic pathway enrichment
4+
##
5+
## Uses KEGGREST to fetch compound-pathway mappings, then runs Fisher's
6+
## exact test. No dependency on MetaboAnalystR or tidymass.
7+
##############################################################################
8+
9+
## Run KEGG pathway ORA using Fisher's exact test
##
## The compound universe is the union of all compounds that belong to a
## pathway of the chosen organism with at least two compounds. The p-value is
## the one-sided (enrichment) hypergeometric tail, which equals the one-sided
## Fisher's exact test. Only pathways with at least one hit are tested and
## FDR-adjusted.
##
## @param query_ids character vector of KEGG compound IDs (e.g., "C00001")
## @param organism  KEGG organism code (e.g., "hsa" for human, "ko" for reference)
## @param p_cutoff  significance threshold (default 0.05); only used for the
##                  progress message, the result is not filtered
##
## @return data.frame sorted by raw_p with columns: pathway, pathway_id, total,
##         expected, hits, raw_p, fold_enrichment, hit_compounds, fdr.
##         An empty data.frame() when KEGGREST is missing, the KEGG API cannot
##         be reached, fewer than two query compounds map to the universe, or
##         no pathway contains a hit.
run_kegg_ora <- function(query_ids,
                         organism = "ko",
                         p_cutoff = 0.05) {
  if (!requireNamespace("KEGGREST", quietly = TRUE)) {
    cat(" KEGGREST not installed, skipping KEGG pathway analysis\n")
    return(data.frame())
  }

  cat(" KEGG ORA: fetching pathway definitions...\n")
  old_opts <- options(timeout = 120)
  on.exit(options(old_opts), add = TRUE)

  # Get compound-pathway links.
  # The handler yields NULL and the early return happens out here: a
  # return() inside the handler only leaves the handler, not this function.
  cpd_pw_link <- tryCatch(
    KEGGREST::keggLink("pathway", "compound"),
    error = function(e) {
      cat(" KEGG API unavailable:", conditionMessage(e), "\n")
      NULL
    }
  )
  if (is.null(cpd_pw_link)) {
    return(data.frame())
  }

  cpd_ids <- sub("^cpd:", "", names(cpd_pw_link))
  pw_ids <- sub("^path:map", "", as.character(cpd_pw_link))

  # Get pathway names
  org_pathways <- tryCatch(
    KEGGREST::keggList("pathway", organism),
    error = function(e) {
      cat(" KEGG pathway list unavailable:", conditionMessage(e), "\n")
      NULL
    }
  )
  if (is.null(org_pathways)) {
    return(data.frame())
  }

  # Pathway identifiers may or may not carry a "path:" prefix depending on
  # the KEGGREST / KEGG API version; strip both forms down to the number.
  pw_nums <- sub(paste0("^(path:)?", organism), "", names(org_pathways))
  pw_names <- sub(" - .*$", "", as.character(org_pathways))
  pw_name_map <- setNames(pw_names, pw_nums)

  # Build pathway compound sets (one pass over the links instead of one scan
  # per pathway), restricted to this organism's pathways with >= 2 compounds
  cpds_by_pw <- lapply(split(cpd_ids, pw_ids), unique)
  pw_cpd_sets <- cpds_by_pw[intersect(pw_nums, names(cpds_by_pw))]
  pw_cpd_sets <- pw_cpd_sets[lengths(pw_cpd_sets) >= 2]
  all_kegg_cpds <- as.character(unique(unlist(pw_cpd_sets, use.names = FALSE)))

  N <- length(all_kegg_cpds) # universe size
  query_in_universe <- intersect(query_ids, all_kegg_cpds)
  n <- length(query_in_universe) # query set size

  cat(" Universe:", N, "compounds |", length(pw_cpd_sets), "pathways | Query:", n,
      "of", length(query_ids), "mapped\n")

  if (n < 2) {
    cat(" Too few query compounds mapped to KEGG, skipping ORA\n")
    return(data.frame())
  }

  # Fisher's exact test for each pathway
  results <- lapply(names(pw_cpd_sets), function(pw_num) {
    pw_cpds <- pw_cpd_sets[[pw_num]]
    hit_cpds <- intersect(query_in_universe, pw_cpds)
    K <- length(pw_cpds) # pathway size
    x <- length(hit_cpds) # hits
    if (x == 0) {
      return(NULL)
    }

    # One-sided enrichment: P(X >= x) for X ~ Hypergeometric(K, N - K, n).
    # Query and pathway are both subsets of the universe, so every cell of
    # the implied 2x2 table is non-negative and expected > 0.
    p_val <- phyper(x - 1, K, N - K, n, lower.tail = FALSE)
    expected <- n * K / N

    data.frame(
      pathway = if (pw_num %in% names(pw_name_map)) pw_name_map[[pw_num]] else pw_num,
      pathway_id = pw_num,
      total = K,
      expected = round(expected, 2),
      hits = x,
      raw_p = p_val,
      fold_enrichment = round(x / expected, 2),
      hit_compounds = paste(hit_cpds, collapse = ";"),
      stringsAsFactors = FALSE
    )
  })
  results <- Filter(Negate(is.null), results)

  if (length(results) == 0) {
    cat(" No pathways enriched\n")
    return(data.frame())
  }

  out <- do.call(rbind, results)
  out$fdr <- p.adjust(out$raw_p, method = "BH")
  out <- out[order(out$raw_p), ]

  n_sig <- sum(out$raw_p < p_cutoff)
  cat(" Enriched pathways (p<", p_cutoff, "):", n_sig, "of", nrow(out), "\n")
  out
}
124+
125+
## Plot pathway enrichment dot plot (Nature style)
##
## x = number of hit compounds, y = pathway (most significant on top),
## point size = -log10(raw p), colour = fold enrichment.
## Uses unqualified ggplot2 functions, so ggplot2 must be attached by the
## calling script.
##
## @param pathway_df data.frame from run_kegg_ora()
## @param top_n      max pathways to show (default 20)
## @param title      plot title
##
## @return a ggplot object, or NULL when pathway_df has no rows
plot_pathway_dotplot <- function(pathway_df,
                                 top_n = 20,
                                 title = "KEGG Pathway Enrichment") {
  if (nrow(pathway_df) == 0) {
    return(NULL)
  }

  # Top N by p-value
  plot_data <- head(pathway_df[order(pathway_df$raw_p), ], top_n)
  plot_data$neg_log_p <- -log10(plot_data$raw_p)

  # Truncate long pathway names
  labels <- as.character(plot_data$pathway)
  too_long <- nchar(labels) > 45
  labels[too_long] <- paste0(substr(labels[too_long], 1, 42), "...")
  # Truncation can make two names identical, and factor() rejects duplicated
  # levels; disambiguate so every pathway keeps its own row.
  labels <- make.unique(labels, sep = " #")
  # Reversed levels put the most significant pathway at the top of the axis
  plot_data$pathway_short <- factor(labels, levels = rev(labels))

  pathway_gradient <- c("#FEE0D2", "#FC9272", "#DE2D26", "#A50F15")

  p <- ggplot(plot_data, aes(x = hits, y = pathway_short)) +
    geom_point(aes(size = neg_log_p, color = fold_enrichment)) +
    scale_color_gradientn(
      colours = pathway_gradient,
      name = "Fold\nenrichment"
    ) +
    scale_size_continuous(
      range = c(2, 7),
      name = expression(-log[10]*italic(P))
    ) +
    labs(x = "Hit compounds", y = NULL, title = title) +
    theme_classic(base_size = 7) +
    theme(
      axis.text.y = element_text(size = 6),
      axis.text.x = element_text(size = 6),
      axis.title = element_text(size = 7),
      plot.title = element_text(size = 8, face = "bold"),
      legend.title = element_text(size = 6),
      legend.text = element_text(size = 5),
      legend.key.size = unit(3, "mm"),
      panel.grid.major.x = element_line(color = "grey92", linewidth = 0.3)
    )

  p
}

tests/e2e/Dockerfile.qexactive

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ RUN Rscript -e " \
2222
if (!requireNamespace('BiocManager', quietly = TRUE)) \
2323
install.packages('BiocManager'); \
2424
BiocManager::install(version = '3.20', ask = FALSE, update = FALSE); \
25-
BiocManager::install(c('xcms', 'limma', 'MSnbase'), ask = FALSE, update = FALSE); \
25+
BiocManager::install(c('xcms', 'limma', 'MSnbase', 'KEGGREST'), ask = FALSE, update = FALSE); \
2626
install.packages(c( \
2727
'ggplot2', 'dplyr', 'tidyr', 'pheatmap', 'patchwork', \
2828
'ggsci', 'scales', \
@@ -38,10 +38,12 @@ WORKDIR /app
3838
COPY packages/engines/stats-worker/R/config.R /app/R/config.R
3939
COPY packages/engines/stats-worker/R/differential.R /app/R/differential.R
4040
COPY packages/engines/stats-worker/R/visualization.R /app/R/visualization.R
41+
COPY packages/engines/stats-worker/R/annotation_ms1.R /app/R/annotation_ms1.R
42+
COPY packages/engines/stats-worker/R/pathway_ora.R /app/R/pathway_ora.R
4143

4244
# Copy Q Exactive pipeline script
4345
COPY tests/e2e/run_e2e_qexactive.R /app/run_e2e_qexactive.R
4446

45-
VOLUME ["/data", "/results"]
47+
VOLUME ["/data", "/results", "/spectral_libraries"]
4648

4749
CMD ["Rscript", "/app/run_e2e_qexactive.R"]

0 commit comments

Comments
 (0)