|
| 1 | +############################################################################## |
| 2 | +## annotation_ms1.R |
| 3 | +## MS1-level metabolite annotation (Level 3) for xcms-based pipelines |
| 4 | +## |
| 5 | +## Loads a pre-built compound database CSV (exact_mass + formula + IDs), |
| 6 | +## calculates theoretical m/z for common adducts, and matches detected |
| 7 | +## features within ppm tolerance. |
| 8 | +## No dependency on tidymass, massdataset, or spectral library parsing. |
| 9 | +############################################################################## |
| 10 | + |
| 11 | +## Common ESI adducts: name -> mass shift (added to neutral monoisotopic mass) |
| 12 | +ESI_POS_ADDUCTS <- data.frame( |
| 13 | + adduct = c("[M+H]+", "[M+Na]+", "[M+K]+", "[M+NH4]+"), |
| 14 | + delta = c(1.007276, 22.989218, 38.963158, 18.034164), |
| 15 | + stringsAsFactors = FALSE |
| 16 | +) |
| 17 | + |
| 18 | +ESI_NEG_ADDUCTS <- data.frame( |
| 19 | + adduct = c("[M-H]-", "[M+FA-H]-", "[M+Cl]-"), |
| 20 | + delta = c(-1.007276, 44.998201, 34.969402), |
| 21 | + stringsAsFactors = FALSE |
| 22 | +) |
| 23 | + |
| 24 | +## Load Level 3 compound database from CSV |
| 25 | +## Expected columns: name, formula, exact_mass, kegg_id, hmdb_id, chebi_id, lipidmaps_id, source |
| 26 | +load_compound_db <- function(csv_path) { |
| 27 | + if (!file.exists(csv_path)) { |
| 28 | + stop("Compound database not found: ", csv_path) |
| 29 | + } |
| 30 | + cat(" Loading compound database:", basename(csv_path), "...") |
| 31 | + db <- read.csv(csv_path, stringsAsFactors = FALSE) |
| 32 | + db$exact_mass <- as.numeric(db$exact_mass) |
| 33 | + db <- db[!is.na(db$exact_mass) & db$exact_mass > 0, ] |
| 34 | + cat(" loaded", nrow(db), "compounds\n") |
| 35 | + db |
| 36 | +} |
| 37 | + |
| 38 | +## Pre-compute theoretical m/z for all compounds x adducts |
| 39 | +## Returns a data.frame with: compound_idx, adduct, theoretical_mz |
| 40 | +compute_theoretical_mz <- function(compound_db, polarity = "positive") { |
| 41 | + adducts <- if (polarity == "positive") ESI_POS_ADDUCTS else ESI_NEG_ADDUCTS |
| 42 | + masses <- compound_db$exact_mass |
| 43 | + |
| 44 | + rows <- list() |
| 45 | + n <- 0L |
| 46 | + for (a in seq_len(nrow(adducts))) { |
| 47 | + theo_mz <- masses + adducts$delta[a] |
| 48 | + valid <- theo_mz > 50 # skip unreasonable values |
| 49 | + idx <- which(valid) |
| 50 | + if (length(idx) > 0) { |
| 51 | + n <- n + 1L |
| 52 | + rows[[n]] <- data.frame( |
| 53 | + compound_idx = idx, |
| 54 | + adduct = adducts$adduct[a], |
| 55 | + theoretical_mz = theo_mz[idx], |
| 56 | + stringsAsFactors = FALSE |
| 57 | + ) |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | + theo_df <- do.call(rbind, rows) |
| 62 | + # Sort by theoretical m/z for fast binary search |
| 63 | + theo_df <- theo_df[order(theo_df$theoretical_mz), ] |
| 64 | + cat(" Pre-computed", nrow(theo_df), "theoretical m/z values (", |
| 65 | + nrow(adducts), "adducts x", nrow(compound_db), "compounds)\n") |
| 66 | + theo_df |
| 67 | +} |
| 68 | + |
| 69 | +## MS1-level annotation using pre-computed theoretical m/z |
| 70 | +## |
| 71 | +## @param feature_mz numeric vector of feature m/z values |
| 72 | +## @param feature_ids character vector of feature IDs |
| 73 | +## @param feature_rt numeric vector of feature RT in seconds (optional) |
| 74 | +## @param compound_db data.frame from load_compound_db() |
| 75 | +## @param theo_mz_df data.frame from compute_theoretical_mz() |
| 76 | +## @param ppm_tol matching tolerance in ppm (default 5) |
| 77 | +## @param max_matches maximum matches per feature (default 3) |
| 78 | +## |
| 79 | +## @return data.frame with annotation results |
| 80 | +annotate_ms1 <- function(feature_mz, |
| 81 | + feature_ids, |
| 82 | + feature_rt = NULL, |
| 83 | + compound_db, |
| 84 | + theo_mz_df, |
| 85 | + ppm_tol = 5, |
| 86 | + max_matches = 3) { |
| 87 | + cat(" MS1 annotation:", length(feature_mz), "features x", |
| 88 | + nrow(theo_mz_df), "theoretical m/z (ppm=", ppm_tol, ")...\n") |
| 89 | + |
| 90 | + theo_mz_vec <- theo_mz_df$theoretical_mz |
| 91 | + results <- list() |
| 92 | + n_matched <- 0L |
| 93 | + |
| 94 | + for (i in seq_along(feature_mz)) { |
| 95 | + obs_mz <- feature_mz[i] |
| 96 | + fid <- feature_ids[i] |
| 97 | + frt <- if (!is.null(feature_rt)) feature_rt[i] else NA |
| 98 | + |
| 99 | + # Binary search for m/z window |
| 100 | + mz_tol <- obs_mz * ppm_tol / 1e6 |
| 101 | + lo <- obs_mz - mz_tol |
| 102 | + hi <- obs_mz + mz_tol |
| 103 | + |
| 104 | + idx_lo <- findInterval(lo, theo_mz_vec) + 1L |
| 105 | + idx_hi <- findInterval(hi, theo_mz_vec) |
| 106 | + |
| 107 | + if (idx_lo > idx_hi || idx_lo > length(theo_mz_vec)) next |
| 108 | + |
| 109 | + candidates <- theo_mz_df[idx_lo:idx_hi, ] |
| 110 | + ppm_err <- abs(candidates$theoretical_mz - obs_mz) / obs_mz * 1e6 |
| 111 | + candidates$ppm_error <- ppm_err |
| 112 | + |
| 113 | + # Sort by ppm error and take top matches |
| 114 | + candidates <- candidates[order(candidates$ppm_error), ] |
| 115 | + if (nrow(candidates) > max_matches) { |
| 116 | + candidates <- candidates[1:max_matches, ] |
| 117 | + } |
| 118 | + |
| 119 | + for (j in seq_len(nrow(candidates))) { |
| 120 | + cidx <- candidates$compound_idx[j] |
| 121 | + cpd <- compound_db[cidx, ] |
| 122 | + n_matched <- n_matched + 1L |
| 123 | + results[[n_matched]] <- data.frame( |
| 124 | + feature_id = fid, |
| 125 | + mz = obs_mz, |
| 126 | + rt = frt, |
| 127 | + matched_name = cpd$name, |
| 128 | + formula = cpd$formula, |
| 129 | + adduct = candidates$adduct[j], |
| 130 | + ppm_error = round(candidates$ppm_error[j], 2), |
| 131 | + kegg_id = cpd$kegg_id, |
| 132 | + hmdb_id = cpd$hmdb_id, |
| 133 | + chebi_id = cpd$chebi_id, |
| 134 | + lipidmaps_id = cpd$lipidmaps_id, |
| 135 | + source = cpd$source, |
| 136 | + stringsAsFactors = FALSE |
| 137 | + ) |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + if (length(results) == 0) { |
| 142 | + cat(" No matches found\n") |
| 143 | + return(data.frame( |
| 144 | + feature_id = character(), mz = numeric(), rt = numeric(), |
| 145 | + matched_name = character(), formula = character(), |
| 146 | + adduct = character(), ppm_error = numeric(), |
| 147 | + kegg_id = character(), hmdb_id = character(), |
| 148 | + chebi_id = character(), lipidmaps_id = character(), |
| 149 | + source = character(), stringsAsFactors = FALSE |
| 150 | + )) |
| 151 | + } |
| 152 | + |
| 153 | + out <- do.call(rbind, results) |
| 154 | + n_features_matched <- length(unique(out$feature_id)) |
| 155 | + cat(" Matched:", n_features_matched, "features ->", nrow(out), "annotations\n") |
| 156 | + out |
| 157 | +} |
| 158 | + |
| 159 | +## Summarize annotation results |
| 160 | +summarize_annotations <- function(ann_df) { |
| 161 | + if (nrow(ann_df) == 0) return(list(n_matched = 0, n_unique_compounds = 0)) |
| 162 | + |
| 163 | + best <- ann_df[order(ann_df$ppm_error), ] |
| 164 | + best <- best[!duplicated(best$feature_id), ] |
| 165 | + |
| 166 | + list( |
| 167 | + n_matched = length(unique(ann_df$feature_id)), |
| 168 | + n_total_annotations = nrow(ann_df), |
| 169 | + n_unique_compounds = length(unique(best$matched_name)), |
| 170 | + n_with_kegg = sum(best$kegg_id != "", na.rm = TRUE), |
| 171 | + n_with_hmdb = sum(best$hmdb_id != "", na.rm = TRUE), |
| 172 | + adduct_distribution = table(ann_df$adduct), |
| 173 | + source_distribution = table(ann_df$source) |
| 174 | + ) |
| 175 | +} |
0 commit comments