From 4a99b87ceb8332531ae708929af8870d2d950e92 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Thu, 4 Jun 2026 22:19:33 -0700 Subject: [PATCH 1/6] allow susie-inf to initiliaze susie-ash --- R/susie_wrapper.R | 53 +++++++++++----- R/univariate_pipeline.R | 57 +++++++++++------ man/rss_analysis_pipeline.Rd | 12 ++-- man/susie_rss_pipeline.Rd | 17 ++--- man/univariate_analysis_pipeline.Rd | 4 +- tests/testthat/test_methods_param.R | 96 +++++++++++++++++++++++++++++ vignettes/fine-mapping.Rmd | 31 +++++++--- 7 files changed, 213 insertions(+), 57 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 7dbdaffd..167905a9 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -111,12 +111,21 @@ format_cs_column <- function(coverage, method) { fit } -prepare_susie_from_inf_args <- function(args, susie_inf_fit, refine_default = NULL) { +# Build the argument list for a SuSiE / SuSiE-ash fit initialised from a +# prior SuSiE-inf fit. `unmappable_effects` controls which branch the +# downstream fit takes: "none" yields the standard SuSiE-inf-initialised +# SuSiE; "ash" yields SuSiE-ash with the SuSiE-inf warm start. +prepare_susie_from_inf_args <- function(args, susie_inf_fit, refine_default = NULL, + unmappable_effects = c("none", "ash")) { + unmappable_effects <- match.arg(unmappable_effects) L <- args[["L"]] if (is.null(L)) L <- length(susie_inf_fit$V) if (is.null(args[["refine"]]) && !is.null(refine_default)) args[["refine"]] <- refine_default - args[["unmappable_effects"]] <- "none" + args[["unmappable_effects"]] <- unmappable_effects args[["model_init"]] <- susie_inf_fit + if (unmappable_effects == "ash") { + args[["convergence_method"]] <- args[["convergence_method"]] %||% "pip" + } if (!is.null(args[["L_greedy"]])) args[["L_greedy"]] <- min(length(susie_inf_fit$V), L) args } @@ -893,14 +902,15 @@ adjust_susie_weights <- function(twas_weights_results, keep_variants, run_allele #' fit. Any subset of \code{c("susie_rss", "susie_inf_rss", #' "susie_ash_rss")}. Default \code{NULL} falls back to a single-method fit #' driven by \code{analysis_method} (backward-compatible behavior). When -#' \code{methods} is passed explicitly, each requested method is fitted; if -#' both \code{"susie_inf_rss"} and \code{"susie_rss"} are present and -#' \code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the -#' SuSiE-inf-RSS result. -#' @param add_susie_inf Logical. When \code{methods} contains both -#' \code{"susie_inf_rss"} and \code{"susie_rss"}, controls whether the -#' SuSiE-RSS fit uses the SuSiE-inf-RSS result as initialisation. Default -#' \code{TRUE}. +#' \code{methods} is passed explicitly, each requested method is fitted; +#' if \code{"susie_inf_rss"} is paired with \code{"susie_rss"} or +#' \code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the +#' SuSiE-inf-RSS fit initialises the downstream method. This exposes five +#' distinct fitting modes mirroring the individual-level pipeline. +#' @param add_susie_inf Logical. When \code{methods} contains +#' \code{"susie_inf_rss"} alongside \code{"susie_rss"} and/or +#' \code{"susie_ash_rss"}, controls whether SuSiE-inf-RSS is chained into +#' the downstream method(s) as initialisation. Default \code{TRUE}. #' @param coverage Coverage level (default: 0.95). #' @param secondary_coverage Secondary coverage levels (default: c(0.7, 0.5)). #' @param signal_cutoff PIP cutoff for selecting top loci (default: 0.1). @@ -949,8 +959,11 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL, } fit_methods <- unique(methods) } - chain_inf_to_susie_rss <- isTRUE(add_susie_inf) && + chain_inf_to_susie_rss <- isTRUE(add_susie_inf) && all(c("susie_inf_rss", "susie_rss") %in% fit_methods) + chain_inf_to_susie_ash_rss <- isTRUE(add_susie_inf) && + all(c("susie_inf_rss", "susie_ash_rss") %in% fit_methods) + any_chained_init_rss <- chain_inf_to_susie_rss || chain_inf_to_susie_ash_rss if (!is.null(sumstats$z)) { z <- sumstats$z @@ -992,7 +1005,7 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL, } fitted_models <- list() - if ("susie_inf_rss" %in% fit_methods || chain_inf_to_susie_rss) { + if ("susie_inf_rss" %in% fit_methods || any_chained_init_rss) { inf_fit <- fit_one_susie_inf_rss() fitted_models[["susie_inf_rss"]] <- .set_finemapping_fit_class(inf_fit, "susie_inf_rss") } @@ -1002,7 +1015,8 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL, if (chain_inf_to_susie_rss) { chained_args <- prepare_susie_from_inf_args( list(L = L, L_greedy = L_greedy), - fitted_models[["susie_inf_rss"]], refine_default = TRUE + fitted_models[["susie_inf_rss"]], refine_default = TRUE, + unmappable_effects = "none" ) rss_fit <- do.call(susie_rss, c(common, chained_args)) } else { @@ -1013,12 +1027,21 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL, fitted_models[[rss_label]] <- .set_finemapping_fit_class(rss_fit, rss_label) } if ("susie_ash_rss" %in% fit_methods) { - ash_fit <- fit_one_susie_ash_rss() + if (chain_inf_to_susie_ash_rss) { + chained_args <- prepare_susie_from_inf_args( + list(L = L, L_greedy = L_greedy), + fitted_models[["susie_inf_rss"]], refine_default = NULL, + unmappable_effects = "ash" + ) + ash_fit <- do.call(susie_rss, c(common, chained_args)) + } else { + ash_fit <- fit_one_susie_ash_rss() + } fitted_models[["susie_ash_rss"]] <- .set_finemapping_fit_class(ash_fit, "susie_ash_rss") } # Drop SuSiE-inf-RSS from post-processing if it was only fit for init - if (chain_inf_to_susie_rss && !("susie_inf_rss" %in% fit_methods)) { + if (any_chained_init_rss && !("susie_inf_rss" %in% fit_methods)) { fitted_models[["susie_inf_rss"]] <- NULL } diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index f8ee219c..3ee6fdf9 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -37,7 +37,9 @@ #' passed explicitly, each requested method is fitted; if #' \code{"susie_inf"} is paired with \code{"susie"} or \code{"susie_ash"} #' (or both) and \code{add_susie_inf = TRUE}, the SuSiE-inf fit -#' initialises each chained downstream method. +#' initialises each chained downstream method. This gives five distinct +#' fitting modes: SuSiE alone, SuSiE with SuSiE-inf init, SuSiE-inf alone, +#' SuSiE-ash alone, and SuSiE-ash with SuSiE-inf init. #' @param add_susie_inf When \code{methods} is \code{NULL}, controls whether #' SuSiE-inf is fitted and chained into SuSiE. When \code{methods} is set #' explicitly, controls whether the chained-init shortcut is applied to @@ -116,8 +118,13 @@ univariate_analysis_pipeline <- function( } methods <- unique(methods) } - chain_inf_to_susie <- isTRUE(add_susie_inf) && + # SuSiE-inf initialisation chains into SuSiE and/or SuSiE-ash whenever + # either of them is requested alongside SuSiE-inf and add_susie_inf is TRUE. + chain_inf_to_susie <- isTRUE(add_susie_inf) && all(c("susie_inf", "susie") %in% methods) + chain_inf_to_susie_ash <- isTRUE(add_susie_inf) && + all(c("susie_inf", "susie_ash") %in% methods) + any_chained_init <- chain_inf_to_susie || chain_inf_to_susie_ash if (isTRUE(twas_weights) && !("susie" %in% methods)) { stop("twas_weights = TRUE requires \"susie\" to be in methods.") } @@ -169,7 +176,7 @@ univariate_analysis_pipeline <- function( ) fitted_models <- list() - if ("susie_inf" %in% methods || chain_inf_to_susie) { + if ("susie_inf" %in% methods || any_chained_init) { message("Fitting SuSiE-inf model on input data ...") inf_args <- modifyList(susie_args, list( X = X, y = Y, @@ -186,7 +193,8 @@ univariate_analysis_pipeline <- function( message("Fitting SuSiE model initialized by SuSiE-inf ...") su_args <- prepare_susie_from_inf_args(susie_args, fitted_models[["susie_inf"]], - refine_default = TRUE) + refine_default = TRUE, + unmappable_effects = "none") su_fit <- do.call(susie, c(list(X = X, y = Y), su_args)) } else { message("Fitting SuSiE model on input data ...") @@ -196,19 +204,28 @@ univariate_analysis_pipeline <- function( } if ("susie_ash" %in% methods) { - message("Fitting SuSiE-ash model on input data ...") - ash_args <- modifyList(susie_args, list( - X = X, y = Y, - unmappable_effects = "ash", - convergence_method = "pip" - )) - ash_fit <- do.call(susie, ash_args) + if (chain_inf_to_susie_ash) { + message("Fitting SuSiE-ash model initialized by SuSiE-inf ...") + ash_args <- prepare_susie_from_inf_args(susie_args, + fitted_models[["susie_inf"]], + refine_default = NULL, + unmappable_effects = "ash") + ash_fit <- do.call(susie, c(list(X = X, y = Y), ash_args)) + } else { + message("Fitting SuSiE-ash model on input data ...") + ash_args <- modifyList(susie_args, list( + X = X, y = Y, + unmappable_effects = "ash", + convergence_method = "pip" + )) + ash_fit <- do.call(susie, ash_args) + } fitted_models[["susie_ash"]] <- .set_finemapping_fit_class(ash_fit, "susie_ash") } - # Drop susie_inf from post-processing if it was only fit to provide init for SuSiE - # (i.e. caller did not request "susie_inf" in methods) - if (chain_inf_to_susie && !("susie_inf" %in% methods)) { + # Drop susie_inf from post-processing if it was only fit to provide init for + # SuSiE / SuSiE-ash (i.e. caller did not request "susie_inf" in methods). + if (any_chained_init && !("susie_inf" %in% methods)) { fitted_models[["susie_inf"]] <- NULL } @@ -321,12 +338,12 @@ load_study_LD <- function(ld_path, region) { #' "susie_ash_rss")}. Default \code{NULL} preserves legacy single-method #' behavior via \code{finemapping_method}. When set explicitly, every #' requested method contributes rows to the unified \code{top_loci}; when -#' both \code{"susie_inf_rss"} and \code{"susie_rss"} are requested and -#' \code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the -#' SuSiE-inf-RSS result. -#' @param add_susie_inf Logical controlling chained init when both -#' \code{"susie_inf_rss"} and \code{"susie_rss"} are in \code{methods}. -#' Default \code{TRUE}. +#' \code{"susie_inf_rss"} is paired with \code{"susie_rss"} or +#' \code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the +#' SuSiE-inf-RSS fit initialises the chained downstream method(s). +#' @param add_susie_inf Logical controlling chained init when +#' \code{"susie_inf_rss"} is in \code{methods} alongside +#' \code{"susie_rss"} and/or \code{"susie_ash_rss"}. Default \code{TRUE}. #' @param finemapping_opts List of fine-mapping options (L, L_greedy, coverage, #' signal_cutoff, min_abs_corr). #' @param impute Whether to impute missing variants via RAISS (default TRUE). diff --git a/man/rss_analysis_pipeline.Rd b/man/rss_analysis_pipeline.Rd index a9b87813..e2a78d75 100644 --- a/man/rss_analysis_pipeline.Rd +++ b/man/rss_analysis_pipeline.Rd @@ -69,13 +69,13 @@ variants to fit. Any subset of \code{c("susie_rss", "susie_inf_rss", "susie_ash_rss")}. Default \code{NULL} preserves legacy single-method behavior via \code{finemapping_method}. When set explicitly, every requested method contributes rows to the unified \code{top_loci}; when -both \code{"susie_inf_rss"} and \code{"susie_rss"} are requested and -\code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the -SuSiE-inf-RSS result.} +\code{"susie_inf_rss"} is paired with \code{"susie_rss"} or +\code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the +SuSiE-inf-RSS fit initialises the chained downstream method(s).} -\item{add_susie_inf}{Logical controlling chained init when both -\code{"susie_inf_rss"} and \code{"susie_rss"} are in \code{methods}. -Default \code{TRUE}.} +\item{add_susie_inf}{Logical controlling chained init when +\code{"susie_inf_rss"} is in \code{methods} alongside +\code{"susie_rss"} and/or \code{"susie_ash_rss"}. Default \code{TRUE}.} \item{finemapping_opts}{List of fine-mapping options (L, L_greedy, coverage, signal_cutoff, min_abs_corr).} diff --git a/man/susie_rss_pipeline.Rd b/man/susie_rss_pipeline.Rd index bbbe6705..375837e3 100644 --- a/man/susie_rss_pipeline.Rd +++ b/man/susie_rss_pipeline.Rd @@ -46,15 +46,16 @@ method; ignored for \code{"susie_inf_rss"} and \code{"susie_ash_rss"}.} fit. Any subset of \code{c("susie_rss", "susie_inf_rss", "susie_ash_rss")}. Default \code{NULL} falls back to a single-method fit driven by \code{analysis_method} (backward-compatible behavior). When -\code{methods} is passed explicitly, each requested method is fitted; if -both \code{"susie_inf_rss"} and \code{"susie_rss"} are present and -\code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the -SuSiE-inf-RSS result.} +\code{methods} is passed explicitly, each requested method is fitted; +if \code{"susie_inf_rss"} is paired with \code{"susie_rss"} or +\code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the +SuSiE-inf-RSS fit initialises the downstream method. This exposes five +distinct fitting modes mirroring the individual-level pipeline.} -\item{add_susie_inf}{Logical. When \code{methods} contains both -\code{"susie_inf_rss"} and \code{"susie_rss"}, controls whether the -SuSiE-RSS fit uses the SuSiE-inf-RSS result as initialisation. Default -\code{TRUE}.} +\item{add_susie_inf}{Logical. When \code{methods} contains +\code{"susie_inf_rss"} alongside \code{"susie_rss"} and/or +\code{"susie_ash_rss"}, controls whether SuSiE-inf-RSS is chained into +the downstream method(s) as initialisation. Default \code{TRUE}.} \item{coverage}{Coverage level (default: 0.95).} diff --git a/man/univariate_analysis_pipeline.Rd b/man/univariate_analysis_pipeline.Rd index 5323ac6e..3f82d2d9 100644 --- a/man/univariate_analysis_pipeline.Rd +++ b/man/univariate_analysis_pipeline.Rd @@ -89,7 +89,9 @@ SuSiE fit as initialization; \code{add_susie_inf = FALSE} maps to passed explicitly, each requested method is fitted; if \code{"susie_inf"} is paired with \code{"susie"} or \code{"susie_ash"} (or both) and \code{add_susie_inf = TRUE}, the SuSiE-inf fit -initialises each chained downstream method.} +initialises each chained downstream method. This gives five distinct +fitting modes: SuSiE alone, SuSiE with SuSiE-inf init, SuSiE-inf alone, +SuSiE-ash alone, and SuSiE-ash with SuSiE-inf init.} \item{add_susie_inf}{When \code{methods} is \code{NULL}, controls whether SuSiE-inf is fitted and chained into SuSiE. When \code{methods} is set diff --git a/tests/testthat/test_methods_param.R b/tests/testthat/test_methods_param.R index df158b3f..b0a0981a 100644 --- a/tests/testthat/test_methods_param.R +++ b/tests/testthat/test_methods_param.R @@ -108,6 +108,102 @@ test_that("univariate_analysis_pipeline: unknown method rejected", { ) }) +# Fifth fitting mode: SuSiE-ash with SuSiE-inf init ----------------------- + +# Helper: signal-rich synthetic data where SuSiE-inf finds non-zero mappable +# effects, so model_init genuinely carries information into SuSiE / SuSiE-ash. +# (The shipped eqtl_region_example is too weak: SuSiE-inf converges with all V=0 +# / mu=0, which susieR's extract_model_init_fields correctly returns NULL for, +# collapsing chained and independent runs to identical results.) +.make_strong_signal_inputs <- function(seed = 1) { + set.seed(seed) + n <- 400; p <- 200 + X <- matrix(rnorm(n * p), n, p) + colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:T") + rownames(X) <- paste0("S", seq_len(n)) + beta_sparse <- rep(0, p); beta_sparse[c(20, 70, 130)] <- 2 + beta_poly <- rnorm(p, sd = 0.05) + y <- as.numeric(X %*% (beta_sparse + beta_poly) + rnorm(n)) + maf <- rep(0.3, p) + list(X = X, y = y, maf = maf) +} + +test_that("univariate_analysis_pipeline: chained SuSiE-inf -> SuSiE-ash differs from independent susie_ash", { + skip_if_not_installed("susieR") + inp <- .make_strong_signal_inputs() + chained <- univariate_analysis_pipeline( + X = inp$X, Y = inp$y, maf = inp$maf, + methods = c("susie_inf", "susie_ash"), add_susie_inf = TRUE, + twas_weights = FALSE, + finemapping_extra_opts = list(refine = FALSE), + signal_cutoff = 0, verbose = 0 + ) + indep <- univariate_analysis_pipeline( + X = inp$X, Y = inp$y, maf = inp$maf, + methods = c("susie_inf", "susie_ash"), add_susie_inf = FALSE, + twas_weights = FALSE, + finemapping_extra_opts = list(refine = FALSE), + signal_cutoff = 0, verbose = 0 + ) + expect_true(all(c("susie_inf", "susie_ash") %in% unique(chained$top_loci$method))) + expect_true(all(c("susie_inf", "susie_ash") %in% unique(indep$top_loci$method))) + expect_false(is.null(chained$susie_inf_fitted)) + expect_false(is.null(chained$susie_ash_fitted)) + # SuSiE-inf must have non-trivial mappable effects for model_init to matter + expect_true(any(chained$susie_inf_fitted$V > 0)) + # With non-trivial init, chained SuSiE-ash PIPs differ from independent run + pip_chained <- chained$susie_ash_fitted$pip + pip_indep <- indep$susie_ash_fitted$pip + expect_equal(length(pip_chained), length(pip_indep)) + expect_true(max(abs(pip_chained - pip_indep)) > 1e-3) +}) + +test_that("univariate_analysis_pipeline: chain dispatch emits the chained-init message", { + skip_if_not_installed("susieR") + inp <- .make_strong_signal_inputs() + expect_message( + univariate_analysis_pipeline( + X = inp$X, Y = inp$y, maf = inp$maf, + methods = c("susie_inf", "susie_ash"), add_susie_inf = TRUE, + twas_weights = FALSE, + finemapping_extra_opts = list(refine = FALSE), + signal_cutoff = 0, verbose = 0 + ), + "SuSiE-ash model initialized by SuSiE-inf" + ) +}) + +test_that("univariate_analysis_pipeline: add_susie_inf=FALSE fits SuSiE-ash without chained init", { + skip_if_not_installed("susieR") + inp <- .make_strong_signal_inputs() + expect_message( + univariate_analysis_pipeline( + X = inp$X, Y = inp$y, maf = inp$maf, + methods = c("susie_inf", "susie_ash"), add_susie_inf = FALSE, + twas_weights = FALSE, + finemapping_extra_opts = list(refine = FALSE), + signal_cutoff = 0, verbose = 0 + ), + "Fitting SuSiE-ash model on input data" + ) +}) + +test_that("univariate_analysis_pipeline: all three methods + chain initialises both susie and susie_ash", { + skip_if_not_installed("susieR") + inp <- .make_strong_signal_inputs() + r <- univariate_analysis_pipeline( + X = inp$X, Y = inp$y, maf = inp$maf, + methods = c("susie_inf", "susie", "susie_ash"), add_susie_inf = TRUE, + twas_weights = FALSE, + finemapping_extra_opts = list(refine = FALSE), + signal_cutoff = 0, verbose = 0 + ) + expect_false(is.null(r$susie_inf_fitted)) + expect_false(is.null(r$susie_fitted)) + expect_false(is.null(r$susie_ash_fitted)) + expect_true(all(c("susie_inf", "susie", "susie_ash") %in% unique(r$top_loci$method))) +}) + test_that("univariate_analysis_pipeline: twas_weights = TRUE requires chained two-stage", { skip_if_not_installed("susieR") inp <- .make_uvp_inputs() diff --git a/vignettes/fine-mapping.Rmd b/vignettes/fine-mapping.Rmd index 35399bf0..cba698ce 100644 --- a/vignettes/fine-mapping.Rmd +++ b/vignettes/fine-mapping.Rmd @@ -95,8 +95,8 @@ rows are useful as a sanity check on inferred non-sparse effects. ## Choosing among SuSiE variants -`pecotmr` exposes four SuSiE-family fine-mapping variants. Pick which to -fit via the `methods` parameter on `univariate_analysis_pipeline()` +`pecotmr` exposes five SuSiE-family fine-mapping variants. Pick which +to fit via the `methods` parameter on `univariate_analysis_pipeline()` (individual-level) or `rss_analysis_pipeline()` / `susie_rss_pipeline()` (summary statistics): @@ -106,6 +106,7 @@ fit via the `methods` parameter on `univariate_analysis_pipeline()` | **SuSiE with SuSiE-inf init** (default) | `methods = c("susie_inf", "susie")` + `add_susie_inf = TRUE` | `methods = c("susie_inf_rss", "susie_rss")` + `add_susie_inf = TRUE` | Two-stage: SuSiE-inf absorbs background polygenicity first, SuSiE then refines sparse signals from that initialisation. Recommended default for most QTL and GWAS fine-mapping. | | **SuSiE-inf alone** | `methods = "susie_inf"` | `methods = "susie_inf_rss"` | Sparse + infinitesimal joint fit. Useful for regions with strong polygenic background where the SuSiE refinement step is undesired. | | **SuSiE-ash alone** | `methods = "susie_ash"` | `methods = "susie_ash_rss"` | Sparse + adaptive shrinkage (ASH) mixture. Useful when you suspect a mix of medium-effect and small-effect variants beyond a small handful of large ones. | +| **SuSiE-ash with SuSiE-inf init** | `methods = c("susie_inf", "susie_ash")` + `add_susie_inf = TRUE` | `methods = c("susie_inf_rss", "susie_ash_rss")` + `add_susie_inf = TRUE` | Same chained two-stage idea as the SuSiE variant but with ASH on the downstream fit: SuSiE-inf absorbs polygenic background first, then SuSiE-ash refines using the mixture prior. | The chained SuSiE-inf → SuSiE two-stage is the default both for backward compatibility and because it produces clean credible sets in @@ -113,10 +114,26 @@ most settings: SuSiE alone can fold polygenic background into noisy "extra" credible sets; SuSiE-inf absorbs that background first so the SuSiE refinement concentrates on sparse signals. -All four populate the same 22-column `top_loci` table. When you fit +`add_susie_inf = TRUE` chains SuSiE-inf into **any** of `"susie"` or +`"susie_ash"` that appear in `methods`. Listing all three +(`methods = c("susie_inf", "susie", "susie_ash")` with +`add_susie_inf = TRUE`) fits SuSiE-inf once, then both SuSiE and +SuSiE-ash initialised from it — useful for direct head-to-head +comparison on the same warm start. + +All five populate the same 22-column `top_loci` table. When you fit multiple methods in one call, the rows are row-bound and labelled by the `method` column. +> Note: SuSiE-inf chain init only carries information when the +> SuSiE-inf fit finds non-zero mappable effects. On a region where +> SuSiE-inf explains everything via its infinitesimal component (all +> `V == 0`, all `mu == 0`), upstream `susieR` correctly treats the +> init as empty, and the chained downstream fit collapses to the +> independent run. This is the right behaviour, not a bug — but it +> means head-to-head differences only show up on regions with at +> least one sparse signal. + ### Backward compatibility with `add_susie_inf` The pre-existing `add_susie_inf` boolean still works exactly as before @@ -127,10 +144,10 @@ when `methods` is `NULL` (the default): - `add_susie_inf = FALSE` → plain SuSiE alone. Equivalent to `methods = "susie"`. -When `methods` is set explicitly, `add_susie_inf` only controls whether -the chained-init shortcut is applied to a `c("susie_inf", "susie")` -pair. Set `add_susie_inf = FALSE` with both methods present to fit -SuSiE-inf and SuSiE independently rather than chained. +When `methods` is set explicitly, `add_susie_inf` controls whether the +chained-init shortcut is applied to any `"susie"` or `"susie_ash"` +method that appears alongside `"susie_inf"`. Set `add_susie_inf = FALSE` +to fit each requested method independently rather than chained. ### Fitting multiple variants in one call From 8f5bbe7ecbd800a6dc5cc7a1db4463456c8595e5 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Thu, 4 Jun 2026 23:25:50 -0700 Subject: [PATCH 2/6] add impute --- .github/recipe/recipe.yaml | 2 + DESCRIPTION | 1 + NAMESPACE | 3 + R/misc.R | 251 +++++++++++++++++++++ _pkgdown.yml | 6 + man/flashier_imputation.Rd | 48 ++++ man/molecular_trait_imputation_pipeline.Rd | 68 ++++++ man/softimpute_imputation.Rd | 47 ++++ pixi.toml | 1 + tests/testthat/test_molecular_imputation.R | 140 ++++++++++++ vignettes/molecular-imputation.Rmd | 185 +++++++++++++++ 11 files changed, 752 insertions(+) create mode 100644 man/flashier_imputation.Rd create mode 100644 man/molecular_trait_imputation_pipeline.Rd create mode 100644 man/softimpute_imputation.Rd create mode 100644 tests/testthat/test_molecular_imputation.R create mode 100644 vignettes/molecular-imputation.Rmd diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml index d7e3ed13..c892356f 100644 --- a/.github/recipe/recipe.yaml +++ b/.github/recipe/recipe.yaml @@ -73,6 +73,7 @@ requirements: - r-readr - r-rfast - r-rlang + - r-softimpute - r-stringr - r-susier - r-tibble @@ -131,6 +132,7 @@ requirements: - r-readr - r-rfast - r-rlang + - r-softimpute - r-stringr - r-susier - r-tibble diff --git a/DESCRIPTION b/DESCRIPTION index 590b817a..3ca4b8c9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -74,6 +74,7 @@ Suggests: rtracklayer, SNPRelate, snpStats, + softImpute, testthat, VariantAnnotation, xgboost diff --git a/NAMESPACE b/NAMESPACE index 3b20514b..0bb64eff 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,7 @@ export(find_overlapping_regions) export(fine_mr) export(fit_mash_contrast) export(fit_susie_inf_then_susie_rss) +export(flashier_imputation) export(format_finemapping_output) export(fsusie_get_cs) export(fsusie_wrapper) @@ -173,6 +174,7 @@ export(merge_sumstats_matrices) export(merge_variant_info) export(meta_analysis_per_cell) export(meta_sldsc_random) +export(molecular_trait_imputation_pipeline) export(mr_analysis) export(mr_ash_rss_weights) export(mr_format) @@ -216,6 +218,7 @@ export(sdpr_weights) export(slalom) export(sldsc_postprocessing_pipeline) export(slice_mash_data) +export(softimpute_imputation) export(standardise_sumstats_columns) export(standardize_sldsc_trait) export(subsetChr) diff --git a/R/misc.R b/R/misc.R index a1d264a6..2c135c42 100644 --- a/R/misc.R +++ b/R/misc.R @@ -959,6 +959,257 @@ xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L, if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]] } +#' softImpute-based matrix imputation +#' +#' Imputes missing values in a numeric matrix via the soft-thresholded SVD +#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} +#' alternating-least-squares solver by default; \code{lambda} defaults to +#' a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not +#' supplied. Columns with all entries missing are removed before fitting +#' and a one-line message records the count. +#' +#' @param data Numeric matrix (samples x features) with missing values +#' (NA). +#' @param rank.max Maximum rank of the SVD solution. Default +#' \code{min(dim(data) - 1)}. +#' @param lambda Nuclear-norm penalty. NULL (default) chooses +#' \code{0.5 * lambda0(data)} so the solution is not degenerate. +#' @param type Optimisation type passed to \code{softImpute::softImpute}; +#' one of \code{"als"} (default, faster) or \code{"svd"}. +#' @param thresh Convergence threshold. Default 1e-5. +#' @param maxit Maximum iterations. Default 100. +#' @param verbose Logical, print progress. Default FALSE. +#' @return The imputed matrix with the same dimensions as the input +#' (minus any all-NA columns). +#' @export +softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL, + type = c("als", "svd"), + thresh = 1e-5, maxit = 100, + verbose = FALSE) { + if (!requireNamespace("softImpute", quietly = TRUE)) { + stop("Package 'softImpute' is required for softimpute_imputation.") + } + type <- match.arg(type) + X <- as.matrix(data) + n <- nrow(X); p <- ncol(X) + all_na <- colSums(is.na(X)) == n + if (any(all_na)) { + if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.") + X <- X[, !all_na, drop = FALSE] + p <- ncol(X) + } + if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L) + if (is.null(lambda)) lambda <- 0.5 * softImpute::lambda0(X) + fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda, + type = type, thresh = thresh, maxit = maxit, + trace.it = isTRUE(verbose)) + Ximp <- softImpute::complete(X, fit) + if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) + if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) + Ximp +} + +#' flashier-based matrix imputation +#' +#' Imputes missing values in a numeric matrix via empirical Bayes +#' matrix factorisation (\code{flashier::flash}). \code{flash} natively +#' supports NA entries and reconstructs them from the posterior fitted +#' values. The default prior families match those used by +#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row +#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column +#' factor. +#' +#' @param data Numeric matrix (samples x features) with missing values +#' (NA). +#' @param var_type Variance structure for the residuals, passed to +#' \code{flashier::flash}. Default 2 (per-column). +#' @param ebnm_fn Prior family. Default +#' \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}. +#' @param greedy_Kmax Maximum number of greedy factors. Default +#' \code{min(20L, min(dim(data)) - 1L)}. +#' @param backfit Logical; run a final backfit pass. Default TRUE. +#' @param verbose Numeric verbosity (0 silent, default). +#' @param ... Additional arguments forwarded to \code{flashier::flash}. +#' @return The imputed matrix with the same dimensions as the input +#' (minus any all-NA columns). +#' @export +flashier_imputation <- function(data, var_type = 2, + ebnm_fn = c(ebnm::ebnm_normal, + ebnm::ebnm_normal_scale_mixture), + greedy_Kmax = NULL, + backfit = TRUE, verbose = 0, ...) { + if (!requireNamespace("flashier", quietly = TRUE)) { + stop("Package 'flashier' is required for flashier_imputation.") + } + if (!requireNamespace("ebnm", quietly = TRUE)) { + stop("Package 'ebnm' is required for flashier_imputation.") + } + X <- as.matrix(data) + n <- nrow(X); p <- ncol(X) + all_na <- colSums(is.na(X)) == n + if (any(all_na)) { + if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) { + message("Removed ", sum(all_na), " column(s) with all entries missing.") + } + X <- X[, !all_na, drop = FALSE] + p <- ncol(X) + } + if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L)) + fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn, + greedy_Kmax = greedy_Kmax, backfit = backfit, + verbose = verbose, ...) + # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found, + # fall back to column means. + if (is.null(fl$n_factors) || fl$n_factors == 0) { + fitted <- matrix(colMeans(X, na.rm = TRUE), + nrow = n, ncol = p, byrow = TRUE) + } else { + fitted <- tcrossprod(fl$L_pm, fl$F_pm) + } + Ximp <- X + na_idx <- is.na(X) + Ximp[na_idx] <- fitted[na_idx] + if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) + if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) + Ximp +} + +#' Pipeline for molecular trait imputation +#' +#' Unified entry point for imputing missing values in a molecular trait +#' matrix (e.g. expression, splicing, protein abundance). Dispatches to +#' one or more underlying methods and optionally evaluates imputation +#' quality on a held-out random mask of observed entries. +#' +#' Available methods: +#' \itemize{ +#' \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative +#' XGBoost per-column models. No structural assumption on the matrix. +#' \item \code{"softimpute"} - \code{\link{softimpute_imputation}}, +#' soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a +#' low-rank smooth signal. +#' \item \code{"flashier"} - \code{\link{flashier_imputation}}, +#' empirical Bayes matrix factorisation via \code{flashier::flash} +#' with sparse + scale-mixture priors. +#' } +#' +#' @param data Numeric matrix (samples x features) with missing values. +#' @param methods Character vector of methods to fit. Any subset of +#' \code{c("xgboost", "softimpute", "flashier")}. Default fits all +#' three. +#' @param method_args Named list of per-method argument lists. List +#' names must match \code{methods}. Default empty. +#' @param evaluate Logical. If TRUE, mask a random fraction of observed +#' entries, run each method, and report root-mean-squared error on the +#' masked cells. Default FALSE. +#' @param eval_fraction Fraction of observed cells to hold out for +#' evaluation. Default 0.05. +#' @param eval_seed Random seed for the held-out mask. Default 42. +#' @param verbose Logical, print progress. Default FALSE. +#' +#' @return A list with: +#' \describe{ +#' \item{imputed}{Named list of imputed matrices, one per requested +#' method.} +#' \item{missingness}{Per-column NA count and overall NA fraction.} +#' \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with +#' one row per method: \code{method}, \code{rmse}, \code{n_held_out}, +#' \code{time_seconds}.} +#' \item{method_args}{The resolved per-method arguments used.} +#' } +#' @export +molecular_trait_imputation_pipeline <- function( + data, + methods = c("xgboost", "softimpute", "flashier"), + method_args = list(), + evaluate = FALSE, + eval_fraction = 0.05, + eval_seed = 42L, + verbose = FALSE) { + valid <- c("xgboost", "softimpute", "flashier") + if (!is.character(methods) || length(methods) == 0L) { + stop("methods must be a non-empty character vector.") + } + bad <- setdiff(methods, valid) + if (length(bad) > 0) { + stop("Unknown imputation method(s): ", paste(bad, collapse = ", "), + ". Valid options: ", paste(valid, collapse = ", ")) + } + methods <- unique(methods) + + Xfull <- as.matrix(data) + n <- nrow(Xfull); p <- ncol(Xfull) + na_mask_full <- is.na(Xfull) + na_per_col <- colSums(na_mask_full) + missingness <- list( + per_column = na_per_col, + n_missing = sum(na_mask_full), + fraction = sum(na_mask_full) / (n * p) + ) + if (verbose) { + message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.", + n, p, 100 * missingness$fraction)) + } + + dispatch <- list(xgboost = xgboost_imputation, + softimpute = softimpute_imputation, + flashier = flashier_imputation) + + run_one <- function(m, X_in) { + args <- method_args[[m]] %||% list() + do.call(dispatch[[m]], c(list(X_in), args)) + } + + # If evaluating, hold out a random subset of observed cells, run each + # method against the masked matrix, then score RMSE on the held-out + # cells. Final imputation is fit against the original matrix so the + # returned matrices use every observed cell. + eval_df <- NULL + if (isTRUE(evaluate)) { + obs_idx <- which(!na_mask_full) + n_hold <- max(1L, floor(eval_fraction * length(obs_idx))) + if (n_hold >= length(obs_idx)) { + stop("eval_fraction too large; would mask all observed entries.") + } + set.seed(eval_seed) + hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE) + Xmasked <- Xfull + held_truth <- Xfull[hold_idx] + Xmasked[hold_idx] <- NA_real_ + eval_records <- vector("list", length(methods)) + for (i in seq_along(methods)) { + m <- methods[i] + if (verbose) message("Evaluating ", m, " on held-out mask ...") + t0 <- proc.time() + Ximp_m <- run_one(m, Xmasked) + t1 <- proc.time() + pred <- Ximp_m[hold_idx] + rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE)) + eval_records[[i]] <- data.frame( + method = m, rmse = rmse, n_held_out = n_hold, + time_seconds = as.numeric((t1 - t0)["elapsed"]), + stringsAsFactors = FALSE + ) + } + eval_df <- do.call(rbind, eval_records) + rownames(eval_df) <- NULL + } + + # Final imputation: run each method on the full matrix + imputed <- list() + for (m in methods) { + if (verbose) message("Imputing with ", m, " ...") + imputed[[m]] <- run_one(m, Xfull) + } + + list( + imputed = imputed, + missingness = missingness, + evaluation = eval_df, + method_args = method_args + ) +} + #' Robust Mahalanobis Distance #' #' Drop-in replacement for \code{\link[stats]{mahalanobis}} that handles diff --git a/_pkgdown.yml b/_pkgdown.yml index e8bdeade..efcf0b98 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -45,6 +45,12 @@ navbar: aria-label: View source on GitHub articles: + - title: "Data Preparation" + desc: > + Imputing missing values in molecular trait matrices before + fine-mapping and TWAS weight training. + contents: + - molecular-imputation - title: "Quality Control" desc: > Allele alignment and LD-mismatch QC for GWAS and QTL summary diff --git a/man/flashier_imputation.Rd b/man/flashier_imputation.Rd new file mode 100644 index 00000000..90578084 --- /dev/null +++ b/man/flashier_imputation.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/misc.R +\name{flashier_imputation} +\alias{flashier_imputation} +\title{flashier-based matrix imputation} +\usage{ +flashier_imputation( + data, + var_type = 2, + ebnm_fn = c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture), + greedy_Kmax = NULL, + backfit = TRUE, + verbose = 0, + ... +) +} +\arguments{ +\item{data}{Numeric matrix (samples x features) with missing values +(NA).} + +\item{var_type}{Variance structure for the residuals, passed to +\code{flashier::flash}. Default 2 (per-column).} + +\item{ebnm_fn}{Prior family. Default +\code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.} + +\item{greedy_Kmax}{Maximum number of greedy factors. Default +\code{min(20L, min(dim(data)) - 1L)}.} + +\item{backfit}{Logical; run a final backfit pass. Default TRUE.} + +\item{verbose}{Numeric verbosity (0 silent, default).} + +\item{...}{Additional arguments forwarded to \code{flashier::flash}.} +} +\value{ +The imputed matrix with the same dimensions as the input + (minus any all-NA columns). +} +\description{ +Imputes missing values in a numeric matrix via empirical Bayes +matrix factorisation (\code{flashier::flash}). \code{flash} natively +supports NA entries and reconstructs them from the posterior fitted +values. The default prior families match those used by +\code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row +factor and \code{ebnm::ebnm_normal_scale_mixture} on the column +factor. +} diff --git a/man/molecular_trait_imputation_pipeline.Rd b/man/molecular_trait_imputation_pipeline.Rd new file mode 100644 index 00000000..68a74d6a --- /dev/null +++ b/man/molecular_trait_imputation_pipeline.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/misc.R +\name{molecular_trait_imputation_pipeline} +\alias{molecular_trait_imputation_pipeline} +\title{Pipeline for molecular trait imputation} +\usage{ +molecular_trait_imputation_pipeline( + data, + methods = c("xgboost", "softimpute", "flashier"), + method_args = list(), + evaluate = FALSE, + eval_fraction = 0.05, + eval_seed = 42L, + verbose = FALSE +) +} +\arguments{ +\item{data}{Numeric matrix (samples x features) with missing values.} + +\item{methods}{Character vector of methods to fit. Any subset of +\code{c("xgboost", "softimpute", "flashier")}. Default fits all +three.} + +\item{method_args}{Named list of per-method argument lists. List +names must match \code{methods}. Default empty.} + +\item{evaluate}{Logical. If TRUE, mask a random fraction of observed +entries, run each method, and report root-mean-squared error on the +masked cells. Default FALSE.} + +\item{eval_fraction}{Fraction of observed cells to hold out for +evaluation. Default 0.05.} + +\item{eval_seed}{Random seed for the held-out mask. Default 42.} + +\item{verbose}{Logical, print progress. Default FALSE.} +} +\value{ +A list with: +\describe{ + \item{imputed}{Named list of imputed matrices, one per requested + method.} + \item{missingness}{Per-column NA count and overall NA fraction.} + \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with + one row per method: \code{method}, \code{rmse}, \code{n_held_out}, + \code{time_seconds}.} + \item{method_args}{The resolved per-method arguments used.} +} +} +\description{ +Unified entry point for imputing missing values in a molecular trait +matrix (e.g. expression, splicing, protein abundance). Dispatches to +one or more underlying methods and optionally evaluates imputation +quality on a held-out random mask of observed entries. +} +\details{ +Available methods: +\itemize{ + \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative + XGBoost per-column models. No structural assumption on the matrix. + \item \code{"softimpute"} - \code{\link{softimpute_imputation}}, + soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a + low-rank smooth signal. + \item \code{"flashier"} - \code{\link{flashier_imputation}}, + empirical Bayes matrix factorisation via \code{flashier::flash} + with sparse + scale-mixture priors. +} +} diff --git a/man/softimpute_imputation.Rd b/man/softimpute_imputation.Rd new file mode 100644 index 00000000..efb08fa5 --- /dev/null +++ b/man/softimpute_imputation.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/misc.R +\name{softimpute_imputation} +\alias{softimpute_imputation} +\title{softImpute-based matrix imputation} +\usage{ +softimpute_imputation( + data, + rank.max = NULL, + lambda = NULL, + type = c("als", "svd"), + thresh = 1e-05, + maxit = 100, + verbose = FALSE +) +} +\arguments{ +\item{data}{Numeric matrix (samples x features) with missing values +(NA).} + +\item{rank.max}{Maximum rank of the SVD solution. Default +\code{min(dim(data) - 1)}.} + +\item{lambda}{Nuclear-norm penalty. NULL (default) chooses +\code{0.5 * lambda0(data)} so the solution is not degenerate.} + +\item{type}{Optimisation type passed to \code{softImpute::softImpute}; +one of \code{"als"} (default, faster) or \code{"svd"}.} + +\item{thresh}{Convergence threshold. Default 1e-5.} + +\item{maxit}{Maximum iterations. Default 100.} + +\item{verbose}{Logical, print progress. Default FALSE.} +} +\value{ +The imputed matrix with the same dimensions as the input + (minus any all-NA columns). +} +\description{ +Imputes missing values in a numeric matrix via the soft-thresholded SVD +algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} +alternating-least-squares solver by default; \code{lambda} defaults to +a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not +supplied. Columns with all entries missing are removed before fitting +and a one-line message records the count. +} diff --git a/pixi.toml b/pixi.toml index 62a2cca3..b2afbe2a 100644 --- a/pixi.toml +++ b/pixi.toml @@ -92,6 +92,7 @@ r45 = {features = ["r45"]} "r-readr" = "*" "r-rfast" = "*" "r-rlang" = "*" +"r-softimpute" = "*" "r-stringr" = "*" "r-susier" = "*" "r-tibble" = "*" diff --git a/tests/testthat/test_molecular_imputation.R b/tests/testthat/test_molecular_imputation.R new file mode 100644 index 00000000..2c68b808 --- /dev/null +++ b/tests/testthat/test_molecular_imputation.R @@ -0,0 +1,140 @@ +context("Molecular trait imputation pipeline") + +# Helper: low-rank synthetic matrix with NA mask. +.make_imputation_inputs <- function(n = 60, p = 15, rank = 3, na_frac = 0.1, + seed = 1) { + set.seed(seed) + U <- matrix(rnorm(n * rank), n, rank) + V <- matrix(rnorm(p * rank), p, rank) + truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p) + X <- truth + na_mask <- matrix(runif(n * p) < na_frac, n, p) + X[na_mask] <- NA + rownames(X) <- paste0("S", seq_len(n)) + colnames(X) <- paste0("F", seq_len(p)) + list(X = X, truth = truth, na_mask = na_mask) +} + +# ============================================================================= +# softimpute_imputation +# ============================================================================= + +test_that("softimpute_imputation: returns matrix with same dims, no NAs", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + Xi <- softimpute_imputation(inp$X) + expect_equal(dim(Xi), dim(inp$X)) + expect_false(any(is.na(Xi))) + expect_equal(rownames(Xi), rownames(inp$X)) +}) + +test_that("softimpute_imputation: handles all-NA columns by removing them", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + X <- inp$X + X[, 5] <- NA + expect_message( + Xi <- softimpute_imputation(X, verbose = TRUE), + "Removed 1 column" + ) + expect_equal(ncol(Xi), ncol(X) - 1L) +}) + +# ============================================================================= +# flashier_imputation +# ============================================================================= + +test_that("flashier_imputation: returns matrix with same dims, no NAs", { + skip_if_not_installed("flashier") + skip_if_not_installed("ebnm") + inp <- .make_imputation_inputs() + Xi <- flashier_imputation(inp$X, verbose = 0) + expect_equal(dim(Xi), dim(inp$X)) + expect_false(any(is.na(Xi))) +}) + +test_that("flashier_imputation: recovers low-rank truth better than column means", { + skip_if_not_installed("flashier") + skip_if_not_installed("ebnm") + inp <- .make_imputation_inputs() + Xi <- flashier_imputation(inp$X, verbose = 0) + # Baseline: column-mean imputation + Xbase <- inp$X + cm <- colMeans(Xbase, na.rm = TRUE) + for (j in seq_len(ncol(Xbase))) Xbase[is.na(Xbase[, j]), j] <- cm[j] + rmse_flash <- sqrt(mean((Xi[inp$na_mask] - inp$truth[inp$na_mask])^2)) + rmse_mean <- sqrt(mean((Xbase[inp$na_mask] - inp$truth[inp$na_mask])^2)) + expect_lt(rmse_flash, rmse_mean) +}) + +# ============================================================================= +# molecular_trait_imputation_pipeline +# ============================================================================= + +test_that("pipeline: single-method dispatch returns named list keyed by method", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute") + expect_named(res$imputed, "softimpute") + expect_equal(dim(res$imputed[["softimpute"]]), dim(inp$X)) + expect_null(res$evaluation) +}) + +test_that("pipeline: multi-method runs all three and returns one matrix per method", { + skip_if_not_installed("softImpute") + skip_if_not_installed("flashier") + skip_if_not_installed("xgboost") + inp <- .make_imputation_inputs() + res <- molecular_trait_imputation_pipeline( + inp$X, methods = c("xgboost", "softimpute", "flashier"), + method_args = list( + xgboost = list(maxiter = 2, nrounds = 10, verbose = FALSE) + ) + ) + expect_named(res$imputed, c("xgboost", "softimpute", "flashier")) + for (m in names(res$imputed)) { + expect_equal(dim(res$imputed[[m]]), dim(inp$X)) + expect_false(any(is.na(res$imputed[[m]]))) + } +}) + +test_that("pipeline: evaluate = TRUE reports RMSE on held-out cells", { + skip_if_not_installed("softImpute") + skip_if_not_installed("flashier") + inp <- .make_imputation_inputs() + res <- molecular_trait_imputation_pipeline( + inp$X, methods = c("softimpute", "flashier"), + evaluate = TRUE, eval_fraction = 0.1, eval_seed = 7 + ) + expect_s3_class(res$evaluation, "data.frame") + expect_equal(nrow(res$evaluation), 2L) + expect_named(res$evaluation, + c("method", "rmse", "n_held_out", "time_seconds")) + expect_true(all(res$evaluation$rmse > 0)) + expect_true(all(res$evaluation$n_held_out > 0)) +}) + +test_that("pipeline: missingness summary matches input NA count", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute") + expect_equal(res$missingness$n_missing, sum(is.na(inp$X))) + expect_equal(res$missingness$fraction, + sum(is.na(inp$X)) / prod(dim(inp$X))) +}) + +test_that("pipeline: rejects unknown method", { + inp <- .make_imputation_inputs() + expect_error( + molecular_trait_imputation_pipeline(inp$X, methods = "knnimpute"), + "Unknown imputation method" + ) +}) + +test_that("pipeline: rejects empty methods vector", { + inp <- .make_imputation_inputs() + expect_error( + molecular_trait_imputation_pipeline(inp$X, methods = character(0)), + "non-empty character vector" + ) +}) diff --git a/vignettes/molecular-imputation.Rmd b/vignettes/molecular-imputation.Rmd new file mode 100644 index 00000000..bbd7fd14 --- /dev/null +++ b/vignettes/molecular-imputation.Rmd @@ -0,0 +1,185 @@ +--- +title: "Imputing Missing Values in Molecular Trait Matrices" +author: "pecotmr authors" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Imputing Missing Values in Molecular Trait Matrices} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, + fig.width = 7, fig.height = 4.5) +``` + +## Overview + +Real molecular trait data (expression, splicing, protein abundance, +chromatin accessibility) almost always has missing values: low-coverage +samples, transcripts below detection in some tissues, batch dropouts. +Many downstream pecotmr workflows expect a complete matrix: + +- [Fine-mapping](fine-mapping.html) — `univariate_analysis_pipeline()` + requires a numeric `Y` vector with no NAs. +- [TWAS weights](twas-weights.html) — `twas_weights_pipeline()` and + `twas_multivariate_weights_pipeline()` require complete `Y`. +- Multivariate fits — `mrmash_wrapper()` and `multivariate_analysis_pipeline()` + reject NA in `Y`. + +`pecotmr` provides three imputation methods plus a unified pipeline. +The methods differ in what they assume about the data: + +| Method | Function | Assumption | When to use | +|---|---|---|---| +| **XGBoost** | `xgboost_imputation()` | None — fits arbitrary nonlinear per-column models | Heterogeneous matrices where columns have different relationships. Slowest but most flexible. | +| **softImpute** | `softimpute_imputation()` | Low-rank smooth signal | Expression matrices with a small number of dominant axes of variation. Fastest. | +| **flashier** | `flashier_imputation()` | Sparse + scale-mixture EBMF factors | Matrices with sparse + diffuse signal mixtures. Good middle ground; reuses the same prior families as `compute_cov_flash()` for mr.mash priors. | + +All three are exposed through `molecular_trait_imputation_pipeline()`, +which can fit multiple methods in one call and evaluate them on a +held-out random mask of observed entries. + +```{r load-packages} +library(pecotmr) +``` + +## Quick start: one method on a synthetic matrix + +A low-rank synthetic example illustrates the contract. We build a +60 × 15 matrix with rank-3 truth + Gaussian noise, then mask 10% of +entries. + +```{r make-synthetic} +set.seed(1) +n <- 60; p <- 15; rank_true <- 3 +U <- matrix(rnorm(n * rank_true), n, rank_true) +V <- matrix(rnorm(p * rank_true), p, rank_true) +truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p) +rownames(truth) <- paste0("S", seq_len(n)) +colnames(truth) <- paste0("F", seq_len(p)) + +# Mask 10% of entries +na_mask <- matrix(runif(n * p) < 0.1, n, p) +X <- truth +X[na_mask] <- NA +sprintf("Missingness: %.1f%%", 100 * mean(is.na(X))) +``` + +Run the three methods individually: + +```{r single-method-runs} +Xi_softimpute <- softimpute_imputation(X) +Xi_flashier <- flashier_imputation(X, verbose = 0) +# XGBoost can be slow; reduce iterations for the vignette. +Xi_xgboost <- xgboost_imputation(X, maxiter = 3, nrounds = 20, + verbose = FALSE) + +# RMSE against the truth on the masked cells +rmse <- function(Xi) sqrt(mean((Xi[na_mask] - truth[na_mask])^2)) +cat(sprintf("softImpute RMSE: %.3f\n", rmse(Xi_softimpute))) +cat(sprintf("flashier RMSE: %.3f\n", rmse(Xi_flashier))) +cat(sprintf("xgboost RMSE: %.3f\n", rmse(Xi_xgboost))) +``` + +On rank-3 truth, flashier and softImpute both exploit the low-rank +structure; XGBoost has no structural assumption and pays for that on +this kind of input. Real expression matrices tend to be closer to the +low-rank regime, so softImpute and flashier are usually competitive. + +## The pipeline: multi-method + held-out CV + +For real data you typically want to compare methods before picking one. +`molecular_trait_imputation_pipeline()` does this in a single call. +With `evaluate = TRUE` it holds out a random fraction of observed cells, +fits each method against the masked matrix, and reports RMSE on the +masked cells. + +```{r pipeline-multi} +res <- molecular_trait_imputation_pipeline( + X, + methods = c("xgboost", "softimpute", "flashier"), + method_args = list( + xgboost = list(maxiter = 3, nrounds = 20, verbose = FALSE) + ), + evaluate = TRUE, + eval_fraction = 0.05, + eval_seed = 7 +) + +res$missingness$fraction # overall missingness +res$evaluation # per-method RMSE + time +str(res$imputed, max.level = 1) # final imputed matrices, one per method +``` + +The pipeline reports three things: + +- **`imputed`** — named list of imputed matrices, one per requested + method. These were fit on the **full** input (including the cells + that were held out for CV), so they are what you would use + downstream. +- **`missingness`** — per-column NA counts and overall NA fraction. +- **`evaluation`** — when `evaluate = TRUE`, a data.frame with one row + per method: `method`, `rmse`, `n_held_out`, `time_seconds`. Use it to + pick the method that best handles your data's structure. + +## Choosing a method + +| Method | Typical RMSE behaviour | Cost | Notes | +|---|---|---|---| +| **softimpute** | Best on smooth low-rank matrices. Sensitive to the choice of `lambda`; the default `0.5 * lambda0(x)` is a reasonable starting point. | Very fast (seconds). | Tune `lambda` and `rank.max` if RMSE is poor on your data. | +| **flashier** | Best on mixed sparse + diffuse signal. Default priors (`ebnm_normal` + `ebnm_normal_scale_mixture`) work well for most expression-like matrices. | Moderate (seconds-to-minutes). | Reuses the same FLASH defaults as `compute_cov_flash()` (mr.mash prior step). | +| **xgboost** | Robust to non-linear column relationships. Can overfit on small matrices. | Slowest (minutes-to-hours for large matrices). | Set `num_workers > 1` for parallel per-column fits. Tune `max_depth` and `nrounds`. | + +A simple rule of thumb: run all three with `evaluate = TRUE` on a +small subset of your data, then use the best for the full matrix. + +## Integration with the rest of pecotmr + +The imputed matrix slots directly into the downstream pipelines: + +```{r downstream, eval=FALSE} +# Imputed phenotype matrix from the pipeline +Y_complete <- res$imputed[["flashier"]] + +# Fit fine-mapping on a single context (one column of Y) +fit <- univariate_analysis_pipeline( + X = genotype_matrix, Y = Y_complete[, "F1"], maf = maf_vector, + twas_weights = TRUE +) + +# Or fit multivariate models on the full matrix +mnm_fit <- multivariate_analysis_pipeline( + X = genotype_matrix, Y = Y_complete, + data_driven_prior_matrices = prior_matrices +) +mt <- twas_multivariate_weights_pipeline( + X = genotype_matrix, Y = Y_complete, mnm_fit = mnm_fit +) +``` + +Imputation happens once, upstream of fine-mapping and TWAS weight +training, on the full phenotype matrix across contexts. The pipelines +do not re-impute internally. + +## Caveats + +1. **Don't over-impute.** A column that's missing in most samples won't + recover useful structure no matter which method you use. Drop + columns with high missingness (`> 50%` is a common cutoff) before + imputing. +2. **All-NA columns are dropped.** Each method removes columns where + every entry is missing and emits a one-line message. The returned + matrix has fewer columns than the input in that case. +3. **CV evaluation uses one random mask.** For more stable RMSE + estimates, run the pipeline a few times with different `eval_seed` + values and average. +4. **softImpute lambda tuning matters.** The default + `0.5 * lambda0(x)` is a starting point. If softImpute looks + uncompetitive on your data, sweep `lambda` over a grid (e.g. + `lambda = lambda0(x) * c(0.1, 0.25, 0.5, 1, 2)`). + +```{r session-info} +sessionInfo() +``` From 1a61facac4b5dba5b8f685ee51f4dba01e6133c5 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 5 Jun 2026 01:18:14 -0700 Subject: [PATCH 3/6] add impute pipeline --- DESCRIPTION | 1 + R/imputation.R | 468 +++++++++++++++++++++ R/misc.R | 359 ---------------- man/flashier_imputation.Rd | 2 +- man/molecular_trait_imputation_pipeline.Rd | 2 +- man/softimpute_imputation.Rd | 38 +- man/xgboost_imputation.Rd | 2 +- tests/testthat/test_molecular_imputation.R | 45 ++ vignettes/molecular-imputation.Rmd | 14 +- 9 files changed, 555 insertions(+), 376 deletions(-) create mode 100644 R/imputation.R diff --git a/DESCRIPTION b/DESCRIPTION index 3ca4b8c9..830def16 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -112,6 +112,7 @@ Collate: 'h2_hdl.R' 'h2_lder.R' 'h2_utils.R' + 'imputation.R' 'ld_loader.R' 'mash_wrapper.R' 'misc.R' diff --git a/R/imputation.R b/R/imputation.R new file mode 100644 index 00000000..226d6be7 --- /dev/null +++ b/R/imputation.R @@ -0,0 +1,468 @@ +# ============================================================================= +# Molecular trait imputation +# +# User-facing imputation methods for filling missing values in molecular trait +# matrices (expression, splicing, protein abundance) before fine-mapping and +# TWAS-weight training. Three method wrappers plus a unified pipeline; see the +# molecular-imputation vignette for when to use which method. +# +# The genotype-side mean_impute() helper lives in misc.R because it is an +# internal building block of compute_LD() and genotype loading, not a +# user-facing imputer. +# ============================================================================= + +#' XGBoost-based iterative imputation of missing values +#' +#' Imputes missing values in a numeric matrix by iteratively training +#' per-column XGBoost models on observed entries and predicting missing ones. +#' Columns that are entirely missing are removed. Initial imputation uses +#' column means. +#' +#' @param data Numeric matrix with missing values (NA). +#' @param maxiter Maximum number of imputation iterations (default 10). +#' @param max_depth Maximum tree depth for XGBoost (default 2). +#' @param nrounds Number of boosting rounds per variable (default 50). +#' @param decreasing Logical. If TRUE, impute variables with most missing +#' values first. Default FALSE (fewest missing first). +#' @param num_workers Number of parallel workers for BiocParallel. Default 1 +#' (sequential). +#' @param verbose Logical, print progress (default TRUE). +#' @return The imputed matrix with the same dimensions as the input (minus +#' any all-NA columns). +#' @importFrom BiocParallel MulticoreParam SerialParam bplapply +#' @export +xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L, + nrounds = 50L, decreasing = FALSE, + num_workers = 1L, verbose = TRUE) { + if (!requireNamespace("xgboost", quietly = TRUE)) + stop("Package 'xgboost' is required for xgboost_imputation") + + xmis <- as.matrix(data) + n <- nrow(xmis) + p <- ncol(xmis) + + # Remove completely missing columns + all_na <- colSums(is.na(xmis)) == n + if (any(all_na)) { + if (verbose) + message("Removed ", sum(all_na), " column(s) with all entries missing.") + xmis <- xmis[, !all_na, drop = FALSE] + p <- ncol(xmis) + } + + # Initial mean imputation + ximp <- xmis + col_means <- colMeans(xmis, na.rm = TRUE) + for (j in seq_len(p)) { + ximp[is.na(xmis[, j]), j] <- col_means[j] + } + + # Missing value locations + NAloc <- is.na(xmis) + noNAvar <- colSums(NAloc) + sort_j <- order(noNAvar, decreasing = decreasing) + nzsort_j <- sort_j[noNAvar[sort_j] > 0] + + if (length(nzsort_j) == 0) { + if (verbose) message("No missing values to impute.") + return(ximp) + } + + # Set up BiocParallel + if (num_workers > 1L) { + BPPARAM <- MulticoreParam(workers = num_workers) + } else { + BPPARAM <- SerialParam() + } + + iter <- 0L + conv_new <- 0 + conv_old <- Inf + ximp_history <- vector("list", maxiter) + + while (conv_new < conv_old && iter < maxiter) { + if (iter > 0) conv_old <- conv_new + if (verbose) message(" XGBoost iteration ", iter + 1L, " in progress...") + + ximp_old <- ximp + + # Impute each variable with missing values + impute_one <- function(var_idx) { + obsi <- !NAloc[, var_idx] + misi <- NAloc[, var_idx] + obsY <- ximp[obsi, var_idx] + obsX <- ximp[obsi, -var_idx, drop = FALSE] + misX <- ximp[misi, -var_idx, drop = FALSE] + + xgb_train <- xgboost::xgb.DMatrix(data = obsX, label = obsY) + xgb_pred <- xgboost::xgb.DMatrix(data = misX) + model <- xgboost::xgb.train( + params = list(max_depth = max_depth, verbosity = 0), + data = xgb_train, nrounds = nrounds) + list(var_idx = var_idx, predicted = predict(model, xgb_pred)) + } + + results <- bplapply(nzsort_j, impute_one, BPPARAM = BPPARAM) + + for (res in results) { + misi <- NAloc[, res$var_idx] + ximp[misi, res$var_idx] <- res$predicted + } + + iter <- iter + 1L + ximp_history[[iter]] <- ximp + + # Convergence: relative change in imputed values + conv_new <- sum((ximp - ximp_old)^2) / sum(ximp^2) + } + + # Return last improving iteration + if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]] +} + +#' softImpute-based matrix imputation +#' +#' Imputes missing values in a numeric matrix via the soft-thresholded SVD +#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} +#' alternating-least-squares solver by default. +#' +#' \code{lambda} drives the bias-variance trade-off. By default the +#' function tunes it via held-out cross-validation: a random fraction of +#' observed cells is masked, softImpute is fit across a geometric grid of +#' lambdas with warm-start, and the lambda minimising RMSE on the masked +#' cells is selected. The final imputation then uses that lambda on the +#' full input. Pass a scalar \code{lambda} to skip tuning. +#' +#' Columns with all entries missing are removed before fitting and a +#' one-line message records the count. +#' +#' @param data Numeric matrix (samples x features) with missing values +#' (NA). +#' @param rank.max Maximum rank of the SVD solution. Default +#' \code{min(dim(data) - 1)}. +#' @param lambda Nuclear-norm penalty. \code{NULL} (default) tunes over +#' a default geometric grid via held-out CV. A scalar uses that lambda +#' directly with no tuning. A vector defines the tuning grid. +#' @param tune_fraction Fraction of observed cells held out for CV. +#' Default 0.05. Ignored when \code{lambda} is a scalar. +#' @param tune_seed Seed for the held-out mask. Default 42L. +#' @param type Optimisation type passed to \code{softImpute::softImpute}; +#' one of \code{"als"} (default, faster) or \code{"svd"}. +#' @param thresh Convergence threshold. Default 1e-5. +#' @param maxit Maximum iterations. Default 100. +#' @param verbose Logical, print progress (including per-lambda CV RMSE). +#' Default FALSE. +#' @return The imputed matrix with the same dimensions as the input +#' (minus any all-NA columns). Carries attributes +#' \code{"softimpute_lambda"} (the lambda used for the final fit) and, +#' when tuning was performed, \code{"softimpute_lambda_cv"} (a +#' data.frame of grid-search RMSE values). +#' @export +softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL, + tune_fraction = 0.05, + tune_seed = 42L, + type = c("als", "svd"), + thresh = 1e-5, maxit = 100, + verbose = FALSE) { + if (!requireNamespace("softImpute", quietly = TRUE)) { + stop("Package 'softImpute' is required for softimpute_imputation.") + } + type <- match.arg(type) + X <- as.matrix(data) + n <- nrow(X); p <- ncol(X) + all_na <- colSums(is.na(X)) == n + if (any(all_na)) { + if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.") + X <- X[, !all_na, drop = FALSE] + p <- ncol(X) + } + if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L) + lambda0_val <- softImpute::lambda0(X) + + # Decide whether to tune. lambda = scalar => no tuning; NULL/vector => tune. + tune <- is.null(lambda) || length(lambda) > 1L + if (tune) { + if (is.null(lambda)) { + # Default geometric grid from lambda0 down to 1e-3 * lambda0. + lambda_grid <- exp(seq(log(lambda0_val), + log(max(1e-3 * lambda0_val, .Machine$double.eps)), + length.out = 10L)) + } else { + lambda_grid <- sort(as.numeric(lambda), decreasing = TRUE) + } + cv_res <- .softimpute_cv_lambda(X, lambda_grid = lambda_grid, + rank.max = rank.max, + tune_fraction = tune_fraction, + tune_seed = tune_seed, + type = type, thresh = thresh, maxit = maxit, + verbose = verbose) + lambda <- cv_res$best_lambda + if (verbose) { + message(sprintf("softimpute_imputation: tuned lambda = %.4g (CV RMSE %.4f over %d cells)", + lambda, cv_res$best_rmse, cv_res$n_held_out)) + } + } + + fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda, + type = type, thresh = thresh, maxit = maxit, + trace.it = isTRUE(verbose)) + Ximp <- softImpute::complete(X, fit) + if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) + if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) + attr(Ximp, "softimpute_lambda") <- lambda + if (tune) attr(Ximp, "softimpute_lambda_cv") <- cv_res$grid + Ximp +} + +# Internal: CV-tune softImpute's lambda over a grid using a held-out mask. +# Uses softImpute's warm.start feature to chain fits along the lambda path +# (largest lambda first). Returns the chosen lambda, its CV RMSE, and the +# full grid scores. +.softimpute_cv_lambda <- function(X, lambda_grid, rank.max, + tune_fraction, tune_seed, + type, thresh, maxit, verbose) { + na_mask <- is.na(X) + obs_idx <- which(!na_mask) + n_hold <- max(1L, floor(tune_fraction * length(obs_idx))) + if (n_hold >= length(obs_idx)) { + stop("tune_fraction too large; would mask all observed entries.") + } + set.seed(tune_seed) + hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE) + truth <- X[hold_idx] + Xmasked <- X + Xmasked[hold_idx] <- NA_real_ + + lambdas <- sort(as.numeric(lambda_grid), decreasing = TRUE) + rmse_vec <- numeric(length(lambdas)) + warm <- NULL + for (i in seq_along(lambdas)) { + fit_args <- list(x = Xmasked, rank.max = rank.max, lambda = lambdas[i], + type = type, thresh = thresh, maxit = maxit, + trace.it = FALSE) + if (!is.null(warm)) fit_args$warm.start <- warm + fit <- tryCatch(do.call(softImpute::softImpute, fit_args), + error = function(e) NULL) + if (is.null(fit)) { + rmse_vec[i] <- NA_real_ + next + } + Ximp <- softImpute::complete(Xmasked, fit) + rmse_vec[i] <- sqrt(mean((Ximp[hold_idx] - truth)^2, na.rm = TRUE)) + warm <- fit + if (verbose) { + message(sprintf(" softimpute CV lambda = %.4g -> RMSE %.4f", + lambdas[i], rmse_vec[i])) + } + } + grid <- data.frame(lambda = lambdas, rmse = rmse_vec, + stringsAsFactors = FALSE) + ok <- which(is.finite(rmse_vec)) + if (length(ok) == 0L) { + stop("All softImpute CV fits failed; supply lambda explicitly.") + } + best <- ok[which.min(rmse_vec[ok])] + list(best_lambda = lambdas[best], best_rmse = rmse_vec[best], + n_held_out = n_hold, grid = grid) +} + +#' flashier-based matrix imputation +#' +#' Imputes missing values in a numeric matrix via empirical Bayes +#' matrix factorisation (\code{flashier::flash}). \code{flash} natively +#' supports NA entries and reconstructs them from the posterior fitted +#' values. The default prior families match those used by +#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row +#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column +#' factor. +#' +#' @param data Numeric matrix (samples x features) with missing values +#' (NA). +#' @param var_type Variance structure for the residuals, passed to +#' \code{flashier::flash}. Default 2 (per-column). +#' @param ebnm_fn Prior family. Default +#' \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}. +#' @param greedy_Kmax Maximum number of greedy factors. Default +#' \code{min(20L, min(dim(data)) - 1L)}. +#' @param backfit Logical; run a final backfit pass. Default TRUE. +#' @param verbose Numeric verbosity (0 silent, default). +#' @param ... Additional arguments forwarded to \code{flashier::flash}. +#' @return The imputed matrix with the same dimensions as the input +#' (minus any all-NA columns). +#' @export +flashier_imputation <- function(data, var_type = 2, + ebnm_fn = c(ebnm::ebnm_normal, + ebnm::ebnm_normal_scale_mixture), + greedy_Kmax = NULL, + backfit = TRUE, verbose = 0, ...) { + if (!requireNamespace("flashier", quietly = TRUE)) { + stop("Package 'flashier' is required for flashier_imputation.") + } + if (!requireNamespace("ebnm", quietly = TRUE)) { + stop("Package 'ebnm' is required for flashier_imputation.") + } + X <- as.matrix(data) + n <- nrow(X); p <- ncol(X) + all_na <- colSums(is.na(X)) == n + if (any(all_na)) { + if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) { + message("Removed ", sum(all_na), " column(s) with all entries missing.") + } + X <- X[, !all_na, drop = FALSE] + p <- ncol(X) + } + if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L)) + fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn, + greedy_Kmax = greedy_Kmax, backfit = backfit, + verbose = verbose, ...) + # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found, + # fall back to column means. + if (is.null(fl$n_factors) || fl$n_factors == 0) { + fitted <- matrix(colMeans(X, na.rm = TRUE), + nrow = n, ncol = p, byrow = TRUE) + } else { + fitted <- tcrossprod(fl$L_pm, fl$F_pm) + } + Ximp <- X + na_idx <- is.na(X) + Ximp[na_idx] <- fitted[na_idx] + if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) + if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) + Ximp +} + +#' Pipeline for molecular trait imputation +#' +#' Unified entry point for imputing missing values in a molecular trait +#' matrix (e.g. expression, splicing, protein abundance). Dispatches to +#' one or more underlying methods and optionally evaluates imputation +#' quality on a held-out random mask of observed entries. +#' +#' Available methods: +#' \itemize{ +#' \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative +#' XGBoost per-column models. No structural assumption on the matrix. +#' \item \code{"softimpute"} - \code{\link{softimpute_imputation}}, +#' soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a +#' low-rank smooth signal. +#' \item \code{"flashier"} - \code{\link{flashier_imputation}}, +#' empirical Bayes matrix factorisation via \code{flashier::flash} +#' with sparse + scale-mixture priors. +#' } +#' +#' @param data Numeric matrix (samples x features) with missing values. +#' @param methods Character vector of methods to fit. Any subset of +#' \code{c("xgboost", "softimpute", "flashier")}. Default fits all +#' three. +#' @param method_args Named list of per-method argument lists. List +#' names must match \code{methods}. Default empty. +#' @param evaluate Logical. If TRUE, mask a random fraction of observed +#' entries, run each method, and report root-mean-squared error on the +#' masked cells. Default FALSE. +#' @param eval_fraction Fraction of observed cells to hold out for +#' evaluation. Default 0.05. +#' @param eval_seed Random seed for the held-out mask. Default 42. +#' @param verbose Logical, print progress. Default FALSE. +#' +#' @return A list with: +#' \describe{ +#' \item{imputed}{Named list of imputed matrices, one per requested +#' method.} +#' \item{missingness}{Per-column NA count and overall NA fraction.} +#' \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with +#' one row per method: \code{method}, \code{rmse}, \code{n_held_out}, +#' \code{time_seconds}.} +#' \item{method_args}{The resolved per-method arguments used.} +#' } +#' @export +molecular_trait_imputation_pipeline <- function( + data, + methods = c("xgboost", "softimpute", "flashier"), + method_args = list(), + evaluate = FALSE, + eval_fraction = 0.05, + eval_seed = 42L, + verbose = FALSE) { + valid <- c("xgboost", "softimpute", "flashier") + if (!is.character(methods) || length(methods) == 0L) { + stop("methods must be a non-empty character vector.") + } + bad <- setdiff(methods, valid) + if (length(bad) > 0) { + stop("Unknown imputation method(s): ", paste(bad, collapse = ", "), + ". Valid options: ", paste(valid, collapse = ", ")) + } + methods <- unique(methods) + + Xfull <- as.matrix(data) + n <- nrow(Xfull); p <- ncol(Xfull) + na_mask_full <- is.na(Xfull) + na_per_col <- colSums(na_mask_full) + missingness <- list( + per_column = na_per_col, + n_missing = sum(na_mask_full), + fraction = sum(na_mask_full) / (n * p) + ) + if (verbose) { + message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.", + n, p, 100 * missingness$fraction)) + } + + dispatch <- list(xgboost = xgboost_imputation, + softimpute = softimpute_imputation, + flashier = flashier_imputation) + + run_one <- function(m, X_in) { + args <- method_args[[m]] %||% list() + do.call(dispatch[[m]], c(list(X_in), args)) + } + + # If evaluating, hold out a random subset of observed cells, run each + # method against the masked matrix, then score RMSE on the held-out + # cells. Final imputation is fit against the original matrix so the + # returned matrices use every observed cell. + eval_df <- NULL + if (isTRUE(evaluate)) { + obs_idx <- which(!na_mask_full) + n_hold <- max(1L, floor(eval_fraction * length(obs_idx))) + if (n_hold >= length(obs_idx)) { + stop("eval_fraction too large; would mask all observed entries.") + } + set.seed(eval_seed) + hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE) + Xmasked <- Xfull + held_truth <- Xfull[hold_idx] + Xmasked[hold_idx] <- NA_real_ + eval_records <- vector("list", length(methods)) + for (i in seq_along(methods)) { + m <- methods[i] + if (verbose) message("Evaluating ", m, " on held-out mask ...") + t0 <- proc.time() + Ximp_m <- run_one(m, Xmasked) + t1 <- proc.time() + pred <- Ximp_m[hold_idx] + rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE)) + eval_records[[i]] <- data.frame( + method = m, rmse = rmse, n_held_out = n_hold, + time_seconds = as.numeric((t1 - t0)["elapsed"]), + stringsAsFactors = FALSE + ) + } + eval_df <- do.call(rbind, eval_records) + rownames(eval_df) <- NULL + } + + # Final imputation: run each method on the full matrix + imputed <- list() + for (m in methods) { + if (verbose) message("Imputing with ", m, " ...") + imputed[[m]] <- run_one(m, Xfull) + } + + list( + imputed = imputed, + missingness = missingness, + evaluation = eval_df, + method_args = method_args + ) +} diff --git a/R/misc.R b/R/misc.R index 2c135c42..d9505296 100644 --- a/R/misc.R +++ b/R/misc.R @@ -850,365 +850,6 @@ filter_molecular_events <- function(events, filters, condition = NULL, remove_al return(filtered_events) } -#' XGBoost-based iterative imputation of missing values -#' -#' Imputes missing values in a numeric matrix by iteratively training -#' per-column XGBoost models on observed entries and predicting missing ones. -#' Columns that are entirely missing are removed. Initial imputation uses -#' column means. -#' -#' @param data Numeric matrix with missing values (NA). -#' @param maxiter Maximum number of imputation iterations (default 10). -#' @param max_depth Maximum tree depth for XGBoost (default 2). -#' @param nrounds Number of boosting rounds per variable (default 50). -#' @param decreasing Logical. If TRUE, impute variables with most missing -#' values first. Default FALSE (fewest missing first). -#' @param num_workers Number of parallel workers for BiocParallel. Default 1 -#' (sequential). -#' @param verbose Logical, print progress (default TRUE). -#' @return The imputed matrix with the same dimensions as the input (minus -#' any all-NA columns). -#' @importFrom BiocParallel MulticoreParam SerialParam bplapply -#' @export -xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L, - nrounds = 50L, decreasing = FALSE, - num_workers = 1L, verbose = TRUE) { - if (!requireNamespace("xgboost", quietly = TRUE)) - stop("Package 'xgboost' is required for xgboost_imputation") - - xmis <- as.matrix(data) - n <- nrow(xmis) - p <- ncol(xmis) - - # Remove completely missing columns - all_na <- colSums(is.na(xmis)) == n - if (any(all_na)) { - if (verbose) - message("Removed ", sum(all_na), " column(s) with all entries missing.") - xmis <- xmis[, !all_na, drop = FALSE] - p <- ncol(xmis) - } - - # Initial mean imputation - ximp <- xmis - col_means <- colMeans(xmis, na.rm = TRUE) - for (j in seq_len(p)) { - ximp[is.na(xmis[, j]), j] <- col_means[j] - } - - # Missing value locations - NAloc <- is.na(xmis) - noNAvar <- colSums(NAloc) - sort_j <- order(noNAvar, decreasing = decreasing) - nzsort_j <- sort_j[noNAvar[sort_j] > 0] - - if (length(nzsort_j) == 0) { - if (verbose) message("No missing values to impute.") - return(ximp) - } - - # Set up BiocParallel - if (num_workers > 1L) { - BPPARAM <- MulticoreParam(workers = num_workers) - } else { - BPPARAM <- SerialParam() - } - - iter <- 0L - conv_new <- 0 - conv_old <- Inf - ximp_history <- vector("list", maxiter) - - while (conv_new < conv_old && iter < maxiter) { - if (iter > 0) conv_old <- conv_new - if (verbose) message(" XGBoost iteration ", iter + 1L, " in progress...") - - ximp_old <- ximp - - # Impute each variable with missing values - impute_one <- function(var_idx) { - obsi <- !NAloc[, var_idx] - misi <- NAloc[, var_idx] - obsY <- ximp[obsi, var_idx] - obsX <- ximp[obsi, -var_idx, drop = FALSE] - misX <- ximp[misi, -var_idx, drop = FALSE] - - xgb_train <- xgboost::xgb.DMatrix(data = obsX, label = obsY) - xgb_pred <- xgboost::xgb.DMatrix(data = misX) - model <- xgboost::xgb.train( - params = list(max_depth = max_depth, verbosity = 0), - data = xgb_train, nrounds = nrounds) - list(var_idx = var_idx, predicted = predict(model, xgb_pred)) - } - - results <- bplapply(nzsort_j, impute_one, BPPARAM = BPPARAM) - - for (res in results) { - misi <- NAloc[, res$var_idx] - ximp[misi, res$var_idx] <- res$predicted - } - - iter <- iter + 1L - ximp_history[[iter]] <- ximp - - # Convergence: relative change in imputed values - conv_new <- sum((ximp - ximp_old)^2) / sum(ximp^2) - } - - # Return last improving iteration - if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]] -} - -#' softImpute-based matrix imputation -#' -#' Imputes missing values in a numeric matrix via the soft-thresholded SVD -#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} -#' alternating-least-squares solver by default; \code{lambda} defaults to -#' a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not -#' supplied. Columns with all entries missing are removed before fitting -#' and a one-line message records the count. -#' -#' @param data Numeric matrix (samples x features) with missing values -#' (NA). -#' @param rank.max Maximum rank of the SVD solution. Default -#' \code{min(dim(data) - 1)}. -#' @param lambda Nuclear-norm penalty. NULL (default) chooses -#' \code{0.5 * lambda0(data)} so the solution is not degenerate. -#' @param type Optimisation type passed to \code{softImpute::softImpute}; -#' one of \code{"als"} (default, faster) or \code{"svd"}. -#' @param thresh Convergence threshold. Default 1e-5. -#' @param maxit Maximum iterations. Default 100. -#' @param verbose Logical, print progress. Default FALSE. -#' @return The imputed matrix with the same dimensions as the input -#' (minus any all-NA columns). -#' @export -softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL, - type = c("als", "svd"), - thresh = 1e-5, maxit = 100, - verbose = FALSE) { - if (!requireNamespace("softImpute", quietly = TRUE)) { - stop("Package 'softImpute' is required for softimpute_imputation.") - } - type <- match.arg(type) - X <- as.matrix(data) - n <- nrow(X); p <- ncol(X) - all_na <- colSums(is.na(X)) == n - if (any(all_na)) { - if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.") - X <- X[, !all_na, drop = FALSE] - p <- ncol(X) - } - if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L) - if (is.null(lambda)) lambda <- 0.5 * softImpute::lambda0(X) - fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda, - type = type, thresh = thresh, maxit = maxit, - trace.it = isTRUE(verbose)) - Ximp <- softImpute::complete(X, fit) - if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) - if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) - Ximp -} - -#' flashier-based matrix imputation -#' -#' Imputes missing values in a numeric matrix via empirical Bayes -#' matrix factorisation (\code{flashier::flash}). \code{flash} natively -#' supports NA entries and reconstructs them from the posterior fitted -#' values. The default prior families match those used by -#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row -#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column -#' factor. -#' -#' @param data Numeric matrix (samples x features) with missing values -#' (NA). -#' @param var_type Variance structure for the residuals, passed to -#' \code{flashier::flash}. Default 2 (per-column). -#' @param ebnm_fn Prior family. Default -#' \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}. -#' @param greedy_Kmax Maximum number of greedy factors. Default -#' \code{min(20L, min(dim(data)) - 1L)}. -#' @param backfit Logical; run a final backfit pass. Default TRUE. -#' @param verbose Numeric verbosity (0 silent, default). -#' @param ... Additional arguments forwarded to \code{flashier::flash}. -#' @return The imputed matrix with the same dimensions as the input -#' (minus any all-NA columns). -#' @export -flashier_imputation <- function(data, var_type = 2, - ebnm_fn = c(ebnm::ebnm_normal, - ebnm::ebnm_normal_scale_mixture), - greedy_Kmax = NULL, - backfit = TRUE, verbose = 0, ...) { - if (!requireNamespace("flashier", quietly = TRUE)) { - stop("Package 'flashier' is required for flashier_imputation.") - } - if (!requireNamespace("ebnm", quietly = TRUE)) { - stop("Package 'ebnm' is required for flashier_imputation.") - } - X <- as.matrix(data) - n <- nrow(X); p <- ncol(X) - all_na <- colSums(is.na(X)) == n - if (any(all_na)) { - if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) { - message("Removed ", sum(all_na), " column(s) with all entries missing.") - } - X <- X[, !all_na, drop = FALSE] - p <- ncol(X) - } - if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L)) - fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn, - greedy_Kmax = greedy_Kmax, backfit = backfit, - verbose = verbose, ...) - # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found, - # fall back to column means. - if (is.null(fl$n_factors) || fl$n_factors == 0) { - fitted <- matrix(colMeans(X, na.rm = TRUE), - nrow = n, ncol = p, byrow = TRUE) - } else { - fitted <- tcrossprod(fl$L_pm, fl$F_pm) - } - Ximp <- X - na_idx <- is.na(X) - Ximp[na_idx] <- fitted[na_idx] - if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) - if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) - Ximp -} - -#' Pipeline for molecular trait imputation -#' -#' Unified entry point for imputing missing values in a molecular trait -#' matrix (e.g. expression, splicing, protein abundance). Dispatches to -#' one or more underlying methods and optionally evaluates imputation -#' quality on a held-out random mask of observed entries. -#' -#' Available methods: -#' \itemize{ -#' \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative -#' XGBoost per-column models. No structural assumption on the matrix. -#' \item \code{"softimpute"} - \code{\link{softimpute_imputation}}, -#' soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a -#' low-rank smooth signal. -#' \item \code{"flashier"} - \code{\link{flashier_imputation}}, -#' empirical Bayes matrix factorisation via \code{flashier::flash} -#' with sparse + scale-mixture priors. -#' } -#' -#' @param data Numeric matrix (samples x features) with missing values. -#' @param methods Character vector of methods to fit. Any subset of -#' \code{c("xgboost", "softimpute", "flashier")}. Default fits all -#' three. -#' @param method_args Named list of per-method argument lists. List -#' names must match \code{methods}. Default empty. -#' @param evaluate Logical. If TRUE, mask a random fraction of observed -#' entries, run each method, and report root-mean-squared error on the -#' masked cells. Default FALSE. -#' @param eval_fraction Fraction of observed cells to hold out for -#' evaluation. Default 0.05. -#' @param eval_seed Random seed for the held-out mask. Default 42. -#' @param verbose Logical, print progress. Default FALSE. -#' -#' @return A list with: -#' \describe{ -#' \item{imputed}{Named list of imputed matrices, one per requested -#' method.} -#' \item{missingness}{Per-column NA count and overall NA fraction.} -#' \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with -#' one row per method: \code{method}, \code{rmse}, \code{n_held_out}, -#' \code{time_seconds}.} -#' \item{method_args}{The resolved per-method arguments used.} -#' } -#' @export -molecular_trait_imputation_pipeline <- function( - data, - methods = c("xgboost", "softimpute", "flashier"), - method_args = list(), - evaluate = FALSE, - eval_fraction = 0.05, - eval_seed = 42L, - verbose = FALSE) { - valid <- c("xgboost", "softimpute", "flashier") - if (!is.character(methods) || length(methods) == 0L) { - stop("methods must be a non-empty character vector.") - } - bad <- setdiff(methods, valid) - if (length(bad) > 0) { - stop("Unknown imputation method(s): ", paste(bad, collapse = ", "), - ". Valid options: ", paste(valid, collapse = ", ")) - } - methods <- unique(methods) - - Xfull <- as.matrix(data) - n <- nrow(Xfull); p <- ncol(Xfull) - na_mask_full <- is.na(Xfull) - na_per_col <- colSums(na_mask_full) - missingness <- list( - per_column = na_per_col, - n_missing = sum(na_mask_full), - fraction = sum(na_mask_full) / (n * p) - ) - if (verbose) { - message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.", - n, p, 100 * missingness$fraction)) - } - - dispatch <- list(xgboost = xgboost_imputation, - softimpute = softimpute_imputation, - flashier = flashier_imputation) - - run_one <- function(m, X_in) { - args <- method_args[[m]] %||% list() - do.call(dispatch[[m]], c(list(X_in), args)) - } - - # If evaluating, hold out a random subset of observed cells, run each - # method against the masked matrix, then score RMSE on the held-out - # cells. Final imputation is fit against the original matrix so the - # returned matrices use every observed cell. - eval_df <- NULL - if (isTRUE(evaluate)) { - obs_idx <- which(!na_mask_full) - n_hold <- max(1L, floor(eval_fraction * length(obs_idx))) - if (n_hold >= length(obs_idx)) { - stop("eval_fraction too large; would mask all observed entries.") - } - set.seed(eval_seed) - hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE) - Xmasked <- Xfull - held_truth <- Xfull[hold_idx] - Xmasked[hold_idx] <- NA_real_ - eval_records <- vector("list", length(methods)) - for (i in seq_along(methods)) { - m <- methods[i] - if (verbose) message("Evaluating ", m, " on held-out mask ...") - t0 <- proc.time() - Ximp_m <- run_one(m, Xmasked) - t1 <- proc.time() - pred <- Ximp_m[hold_idx] - rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE)) - eval_records[[i]] <- data.frame( - method = m, rmse = rmse, n_held_out = n_hold, - time_seconds = as.numeric((t1 - t0)["elapsed"]), - stringsAsFactors = FALSE - ) - } - eval_df <- do.call(rbind, eval_records) - rownames(eval_df) <- NULL - } - - # Final imputation: run each method on the full matrix - imputed <- list() - for (m in methods) { - if (verbose) message("Imputing with ", m, " ...") - imputed[[m]] <- run_one(m, Xfull) - } - - list( - imputed = imputed, - missingness = missingness, - evaluation = eval_df, - method_args = method_args - ) -} #' Robust Mahalanobis Distance #' diff --git a/man/flashier_imputation.Rd b/man/flashier_imputation.Rd index 90578084..7762c785 100644 --- a/man/flashier_imputation.Rd +++ b/man/flashier_imputation.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/misc.R +% Please edit documentation in R/imputation.R \name{flashier_imputation} \alias{flashier_imputation} \title{flashier-based matrix imputation} diff --git a/man/molecular_trait_imputation_pipeline.Rd b/man/molecular_trait_imputation_pipeline.Rd index 68a74d6a..7932803b 100644 --- a/man/molecular_trait_imputation_pipeline.Rd +++ b/man/molecular_trait_imputation_pipeline.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/misc.R +% Please edit documentation in R/imputation.R \name{molecular_trait_imputation_pipeline} \alias{molecular_trait_imputation_pipeline} \title{Pipeline for molecular trait imputation} diff --git a/man/softimpute_imputation.Rd b/man/softimpute_imputation.Rd index efb08fa5..89eadff7 100644 --- a/man/softimpute_imputation.Rd +++ b/man/softimpute_imputation.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/misc.R +% Please edit documentation in R/imputation.R \name{softimpute_imputation} \alias{softimpute_imputation} \title{softImpute-based matrix imputation} @@ -8,6 +8,8 @@ softimpute_imputation( data, rank.max = NULL, lambda = NULL, + tune_fraction = 0.05, + tune_seed = 42L, type = c("als", "svd"), thresh = 1e-05, maxit = 100, @@ -21,8 +23,14 @@ softimpute_imputation( \item{rank.max}{Maximum rank of the SVD solution. Default \code{min(dim(data) - 1)}.} -\item{lambda}{Nuclear-norm penalty. NULL (default) chooses -\code{0.5 * lambda0(data)} so the solution is not degenerate.} +\item{lambda}{Nuclear-norm penalty. \code{NULL} (default) tunes over +a default geometric grid via held-out CV. A scalar uses that lambda +directly with no tuning. A vector defines the tuning grid.} + +\item{tune_fraction}{Fraction of observed cells held out for CV. +Default 0.05. Ignored when \code{lambda} is a scalar.} + +\item{tune_seed}{Seed for the held-out mask. Default 42L.} \item{type}{Optimisation type passed to \code{softImpute::softImpute}; one of \code{"als"} (default, faster) or \code{"svd"}.} @@ -31,17 +39,29 @@ one of \code{"als"} (default, faster) or \code{"svd"}.} \item{maxit}{Maximum iterations. Default 100.} -\item{verbose}{Logical, print progress. Default FALSE.} +\item{verbose}{Logical, print progress (including per-lambda CV RMSE). +Default FALSE.} } \value{ The imputed matrix with the same dimensions as the input - (minus any all-NA columns). + (minus any all-NA columns). Carries attributes + \code{"softimpute_lambda"} (the lambda used for the final fit) and, + when tuning was performed, \code{"softimpute_lambda_cv"} (a + data.frame of grid-search RMSE values). } \description{ Imputes missing values in a numeric matrix via the soft-thresholded SVD algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} -alternating-least-squares solver by default; \code{lambda} defaults to -a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not -supplied. Columns with all entries missing are removed before fitting -and a one-line message records the count. +alternating-least-squares solver by default. +} +\details{ +\code{lambda} drives the bias-variance trade-off. By default the +function tunes it via held-out cross-validation: a random fraction of +observed cells is masked, softImpute is fit across a geometric grid of +lambdas with warm-start, and the lambda minimising RMSE on the masked +cells is selected. The final imputation then uses that lambda on the +full input. Pass a scalar \code{lambda} to skip tuning. + +Columns with all entries missing are removed before fitting and a +one-line message records the count. } diff --git a/man/xgboost_imputation.Rd b/man/xgboost_imputation.Rd index 0bc3ef03..b09309ee 100644 --- a/man/xgboost_imputation.Rd +++ b/man/xgboost_imputation.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/misc.R +% Please edit documentation in R/imputation.R \name{xgboost_imputation} \alias{xgboost_imputation} \title{XGBoost-based iterative imputation of missing values} diff --git a/tests/testthat/test_molecular_imputation.R b/tests/testthat/test_molecular_imputation.R index 2c68b808..54d39a4a 100644 --- a/tests/testthat/test_molecular_imputation.R +++ b/tests/testthat/test_molecular_imputation.R @@ -40,6 +40,51 @@ test_that("softimpute_imputation: handles all-NA columns by removing them", { expect_equal(ncol(Xi), ncol(X) - 1L) }) +test_that("softimpute_imputation: default tunes lambda via CV and attaches grid", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + Xi <- softimpute_imputation(inp$X) # default lambda = NULL -> tune + expect_false(is.null(attr(Xi, "softimpute_lambda"))) + grid <- attr(Xi, "softimpute_lambda_cv") + expect_s3_class(grid, "data.frame") + expect_named(grid, c("lambda", "rmse")) + expect_true(all(diff(grid$lambda) <= 0)) # grid sorted descending + expect_true(any(is.finite(grid$rmse))) + # The chosen lambda must be one of the grid points + expect_true(attr(Xi, "softimpute_lambda") %in% grid$lambda) +}) + +test_that("softimpute_imputation: scalar lambda bypasses tuning", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + l0 <- softImpute::lambda0(inp$X) + Xi <- softimpute_imputation(inp$X, lambda = 0.3 * l0) + expect_equal(attr(Xi, "softimpute_lambda"), 0.3 * l0) + expect_null(attr(Xi, "softimpute_lambda_cv")) +}) + +test_that("softimpute_imputation: vector lambda restricts tuning to that grid", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs() + l0 <- softImpute::lambda0(inp$X) + custom_grid <- l0 * c(0.05, 0.1, 0.25, 0.5) + Xi <- softimpute_imputation(inp$X, lambda = custom_grid) + grid <- attr(Xi, "softimpute_lambda_cv") + expect_setequal(grid$lambda, custom_grid) + expect_true(attr(Xi, "softimpute_lambda") %in% custom_grid) +}) + +test_that("softimpute_imputation: chosen lambda minimises CV RMSE on the grid", { + skip_if_not_installed("softImpute") + inp <- .make_imputation_inputs(n = 80, p = 20, rank = 3, seed = 11) + Xi <- softimpute_imputation(inp$X) + grid <- attr(Xi, "softimpute_lambda_cv") + chosen <- attr(Xi, "softimpute_lambda") + ok <- which(is.finite(grid$rmse)) + best_idx <- ok[which.min(grid$rmse[ok])] + expect_equal(chosen, grid$lambda[best_idx]) +}) + # ============================================================================= # flashier_imputation # ============================================================================= diff --git a/vignettes/molecular-imputation.Rmd b/vignettes/molecular-imputation.Rmd index bbd7fd14..49334d85 100644 --- a/vignettes/molecular-imputation.Rmd +++ b/vignettes/molecular-imputation.Rmd @@ -128,7 +128,7 @@ The pipeline reports three things: | Method | Typical RMSE behaviour | Cost | Notes | |---|---|---|---| -| **softimpute** | Best on smooth low-rank matrices. Sensitive to the choice of `lambda`; the default `0.5 * lambda0(x)` is a reasonable starting point. | Very fast (seconds). | Tune `lambda` and `rank.max` if RMSE is poor on your data. | +| **softimpute** | Best on smooth low-rank matrices. `lambda` is auto-tuned by held-out CV by default. | Very fast (seconds) plus the lambda-grid pass. | Pass a scalar `lambda` to skip tuning; pass a vector to restrict the CV grid. | | **flashier** | Best on mixed sparse + diffuse signal. Default priors (`ebnm_normal` + `ebnm_normal_scale_mixture`) work well for most expression-like matrices. | Moderate (seconds-to-minutes). | Reuses the same FLASH defaults as `compute_cov_flash()` (mr.mash prior step). | | **xgboost** | Robust to non-linear column relationships. Can overfit on small matrices. | Slowest (minutes-to-hours for large matrices). | Set `num_workers > 1` for parallel per-column fits. Tune `max_depth` and `nrounds`. | @@ -175,10 +175,14 @@ do not re-impute internally. 3. **CV evaluation uses one random mask.** For more stable RMSE estimates, run the pipeline a few times with different `eval_seed` values and average. -4. **softImpute lambda tuning matters.** The default - `0.5 * lambda0(x)` is a starting point. If softImpute looks - uncompetitive on your data, sweep `lambda` over a grid (e.g. - `lambda = lambda0(x) * c(0.1, 0.25, 0.5, 1, 2)`). +4. **softImpute `lambda` is auto-tuned by default.** A random + fraction of observed cells (default 5%) is held out, softImpute is + fit across a geometric grid of lambdas via warm-start, and the + lambda with the lowest held-out RMSE wins. Pass `lambda = ` + to skip tuning; pass `lambda = ` to restrict the grid. + The chosen lambda and the full grid scores are attached to the + returned matrix as the `"softimpute_lambda"` and + `"softimpute_lambda_cv"` attributes. ```{r session-info} sessionInfo() From 3f2218b02cb8cac7f942bc77386c91204ed21c80 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 5 Jun 2026 15:22:53 -0700 Subject: [PATCH 4/6] clean up imputation and twas z-score --- vignettes/molecular-imputation.Rmd | 189 ----------------------------- 1 file changed, 189 deletions(-) delete mode 100644 vignettes/molecular-imputation.Rmd diff --git a/vignettes/molecular-imputation.Rmd b/vignettes/molecular-imputation.Rmd deleted file mode 100644 index 49334d85..00000000 --- a/vignettes/molecular-imputation.Rmd +++ /dev/null @@ -1,189 +0,0 @@ ---- -title: "Imputing Missing Values in Molecular Trait Matrices" -author: "pecotmr authors" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Imputing Missing Values in Molecular Trait Matrices} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, - fig.width = 7, fig.height = 4.5) -``` - -## Overview - -Real molecular trait data (expression, splicing, protein abundance, -chromatin accessibility) almost always has missing values: low-coverage -samples, transcripts below detection in some tissues, batch dropouts. -Many downstream pecotmr workflows expect a complete matrix: - -- [Fine-mapping](fine-mapping.html) — `univariate_analysis_pipeline()` - requires a numeric `Y` vector with no NAs. -- [TWAS weights](twas-weights.html) — `twas_weights_pipeline()` and - `twas_multivariate_weights_pipeline()` require complete `Y`. -- Multivariate fits — `mrmash_wrapper()` and `multivariate_analysis_pipeline()` - reject NA in `Y`. - -`pecotmr` provides three imputation methods plus a unified pipeline. -The methods differ in what they assume about the data: - -| Method | Function | Assumption | When to use | -|---|---|---|---| -| **XGBoost** | `xgboost_imputation()` | None — fits arbitrary nonlinear per-column models | Heterogeneous matrices where columns have different relationships. Slowest but most flexible. | -| **softImpute** | `softimpute_imputation()` | Low-rank smooth signal | Expression matrices with a small number of dominant axes of variation. Fastest. | -| **flashier** | `flashier_imputation()` | Sparse + scale-mixture EBMF factors | Matrices with sparse + diffuse signal mixtures. Good middle ground; reuses the same prior families as `compute_cov_flash()` for mr.mash priors. | - -All three are exposed through `molecular_trait_imputation_pipeline()`, -which can fit multiple methods in one call and evaluate them on a -held-out random mask of observed entries. - -```{r load-packages} -library(pecotmr) -``` - -## Quick start: one method on a synthetic matrix - -A low-rank synthetic example illustrates the contract. We build a -60 × 15 matrix with rank-3 truth + Gaussian noise, then mask 10% of -entries. - -```{r make-synthetic} -set.seed(1) -n <- 60; p <- 15; rank_true <- 3 -U <- matrix(rnorm(n * rank_true), n, rank_true) -V <- matrix(rnorm(p * rank_true), p, rank_true) -truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p) -rownames(truth) <- paste0("S", seq_len(n)) -colnames(truth) <- paste0("F", seq_len(p)) - -# Mask 10% of entries -na_mask <- matrix(runif(n * p) < 0.1, n, p) -X <- truth -X[na_mask] <- NA -sprintf("Missingness: %.1f%%", 100 * mean(is.na(X))) -``` - -Run the three methods individually: - -```{r single-method-runs} -Xi_softimpute <- softimpute_imputation(X) -Xi_flashier <- flashier_imputation(X, verbose = 0) -# XGBoost can be slow; reduce iterations for the vignette. -Xi_xgboost <- xgboost_imputation(X, maxiter = 3, nrounds = 20, - verbose = FALSE) - -# RMSE against the truth on the masked cells -rmse <- function(Xi) sqrt(mean((Xi[na_mask] - truth[na_mask])^2)) -cat(sprintf("softImpute RMSE: %.3f\n", rmse(Xi_softimpute))) -cat(sprintf("flashier RMSE: %.3f\n", rmse(Xi_flashier))) -cat(sprintf("xgboost RMSE: %.3f\n", rmse(Xi_xgboost))) -``` - -On rank-3 truth, flashier and softImpute both exploit the low-rank -structure; XGBoost has no structural assumption and pays for that on -this kind of input. Real expression matrices tend to be closer to the -low-rank regime, so softImpute and flashier are usually competitive. - -## The pipeline: multi-method + held-out CV - -For real data you typically want to compare methods before picking one. -`molecular_trait_imputation_pipeline()` does this in a single call. -With `evaluate = TRUE` it holds out a random fraction of observed cells, -fits each method against the masked matrix, and reports RMSE on the -masked cells. - -```{r pipeline-multi} -res <- molecular_trait_imputation_pipeline( - X, - methods = c("xgboost", "softimpute", "flashier"), - method_args = list( - xgboost = list(maxiter = 3, nrounds = 20, verbose = FALSE) - ), - evaluate = TRUE, - eval_fraction = 0.05, - eval_seed = 7 -) - -res$missingness$fraction # overall missingness -res$evaluation # per-method RMSE + time -str(res$imputed, max.level = 1) # final imputed matrices, one per method -``` - -The pipeline reports three things: - -- **`imputed`** — named list of imputed matrices, one per requested - method. These were fit on the **full** input (including the cells - that were held out for CV), so they are what you would use - downstream. -- **`missingness`** — per-column NA counts and overall NA fraction. -- **`evaluation`** — when `evaluate = TRUE`, a data.frame with one row - per method: `method`, `rmse`, `n_held_out`, `time_seconds`. Use it to - pick the method that best handles your data's structure. - -## Choosing a method - -| Method | Typical RMSE behaviour | Cost | Notes | -|---|---|---|---| -| **softimpute** | Best on smooth low-rank matrices. `lambda` is auto-tuned by held-out CV by default. | Very fast (seconds) plus the lambda-grid pass. | Pass a scalar `lambda` to skip tuning; pass a vector to restrict the CV grid. | -| **flashier** | Best on mixed sparse + diffuse signal. Default priors (`ebnm_normal` + `ebnm_normal_scale_mixture`) work well for most expression-like matrices. | Moderate (seconds-to-minutes). | Reuses the same FLASH defaults as `compute_cov_flash()` (mr.mash prior step). | -| **xgboost** | Robust to non-linear column relationships. Can overfit on small matrices. | Slowest (minutes-to-hours for large matrices). | Set `num_workers > 1` for parallel per-column fits. Tune `max_depth` and `nrounds`. | - -A simple rule of thumb: run all three with `evaluate = TRUE` on a -small subset of your data, then use the best for the full matrix. - -## Integration with the rest of pecotmr - -The imputed matrix slots directly into the downstream pipelines: - -```{r downstream, eval=FALSE} -# Imputed phenotype matrix from the pipeline -Y_complete <- res$imputed[["flashier"]] - -# Fit fine-mapping on a single context (one column of Y) -fit <- univariate_analysis_pipeline( - X = genotype_matrix, Y = Y_complete[, "F1"], maf = maf_vector, - twas_weights = TRUE -) - -# Or fit multivariate models on the full matrix -mnm_fit <- multivariate_analysis_pipeline( - X = genotype_matrix, Y = Y_complete, - data_driven_prior_matrices = prior_matrices -) -mt <- twas_multivariate_weights_pipeline( - X = genotype_matrix, Y = Y_complete, mnm_fit = mnm_fit -) -``` - -Imputation happens once, upstream of fine-mapping and TWAS weight -training, on the full phenotype matrix across contexts. The pipelines -do not re-impute internally. - -## Caveats - -1. **Don't over-impute.** A column that's missing in most samples won't - recover useful structure no matter which method you use. Drop - columns with high missingness (`> 50%` is a common cutoff) before - imputing. -2. **All-NA columns are dropped.** Each method removes columns where - every entry is missing and emits a one-line message. The returned - matrix has fewer columns than the input in that case. -3. **CV evaluation uses one random mask.** For more stable RMSE - estimates, run the pipeline a few times with different `eval_seed` - values and average. -4. **softImpute `lambda` is auto-tuned by default.** A random - fraction of observed cells (default 5%) is held out, softImpute is - fit across a geometric grid of lambdas via warm-start, and the - lambda with the lowest held-out RMSE wins. Pass `lambda = ` - to skip tuning; pass `lambda = ` to restrict the grid. - The chosen lambda and the full grid scores are attached to the - returned matrix as the `"softimpute_lambda"` and - `"softimpute_lambda_cv"` attributes. - -```{r session-info} -sessionInfo() -``` From 894dd3ad64a54ebc1b46062dfac6112a27da1c18 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 5 Jun 2026 19:10:13 -0700 Subject: [PATCH 5/6] vignette updates --- DESCRIPTION | 5 +- NAMESPACE | 5 - R/imputation.R | 468 --------------------- _pkgdown.yml | 6 - man/flashier_imputation.Rd | 48 --- man/molecular_trait_imputation_pipeline.Rd | 68 --- man/softimpute_imputation.Rd | 67 --- man/xgboost_imputation.Rd | 43 -- tests/testthat/test_misc.R | 44 -- tests/testthat/test_molecular_imputation.R | 185 -------- 10 files changed, 1 insertion(+), 938 deletions(-) delete mode 100644 R/imputation.R delete mode 100644 man/flashier_imputation.Rd delete mode 100644 man/molecular_trait_imputation_pipeline.Rd delete mode 100644 man/softimpute_imputation.Rd delete mode 100644 man/xgboost_imputation.Rd delete mode 100644 tests/testthat/test_molecular_imputation.R diff --git a/DESCRIPTION b/DESCRIPTION index 830def16..acc98593 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -74,10 +74,8 @@ Suggests: rtracklayer, SNPRelate, snpStats, - softImpute, testthat, - VariantAnnotation, - xgboost + VariantAnnotation Remotes: stephenslab/fsusieR, stephenslab/mvsusieR, @@ -112,7 +110,6 @@ Collate: 'h2_hdl.R' 'h2_lder.R' 'h2_utils.R' - 'imputation.R' 'ld_loader.R' 'mash_wrapper.R' 'misc.R' diff --git a/NAMESPACE b/NAMESPACE index 0bb64eff..8752a7c7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,7 +72,6 @@ export(find_overlapping_regions) export(fine_mr) export(fit_mash_contrast) export(fit_susie_inf_then_susie_rss) -export(flashier_imputation) export(format_finemapping_output) export(fsusie_get_cs) export(fsusie_wrapper) @@ -174,7 +173,6 @@ export(merge_sumstats_matrices) export(merge_variant_info) export(meta_analysis_per_cell) export(meta_sldsc_random) -export(molecular_trait_imputation_pipeline) export(mr_analysis) export(mr_ash_rss_weights) export(mr_format) @@ -218,7 +216,6 @@ export(sdpr_weights) export(slalom) export(sldsc_postprocessing_pipeline) export(slice_mash_data) -export(softimpute_imputation) export(standardise_sumstats_columns) export(standardize_sldsc_trait) export(subsetChr) @@ -247,7 +244,6 @@ export(univariate_analysis_pipeline) export(update_mash_model_cov) export(wald_test_pval) export(writeSumstatsVcf) -export(xgboost_imputation) export(xqtl_enrichment_wrapper) export(z_to_pvalue) exportClasses(AlleleQCResult) @@ -327,7 +323,6 @@ import(dplyr) import(tibble) import(tidyr) importFrom(BiocParallel,MulticoreParam) -importFrom(BiocParallel,SerialParam) importFrom(BiocParallel,bplapply) importFrom(BiocParallel,bpparam) importFrom(BiocParallel,bpworkers) diff --git a/R/imputation.R b/R/imputation.R deleted file mode 100644 index 226d6be7..00000000 --- a/R/imputation.R +++ /dev/null @@ -1,468 +0,0 @@ -# ============================================================================= -# Molecular trait imputation -# -# User-facing imputation methods for filling missing values in molecular trait -# matrices (expression, splicing, protein abundance) before fine-mapping and -# TWAS-weight training. Three method wrappers plus a unified pipeline; see the -# molecular-imputation vignette for when to use which method. -# -# The genotype-side mean_impute() helper lives in misc.R because it is an -# internal building block of compute_LD() and genotype loading, not a -# user-facing imputer. -# ============================================================================= - -#' XGBoost-based iterative imputation of missing values -#' -#' Imputes missing values in a numeric matrix by iteratively training -#' per-column XGBoost models on observed entries and predicting missing ones. -#' Columns that are entirely missing are removed. Initial imputation uses -#' column means. -#' -#' @param data Numeric matrix with missing values (NA). -#' @param maxiter Maximum number of imputation iterations (default 10). -#' @param max_depth Maximum tree depth for XGBoost (default 2). -#' @param nrounds Number of boosting rounds per variable (default 50). -#' @param decreasing Logical. If TRUE, impute variables with most missing -#' values first. Default FALSE (fewest missing first). -#' @param num_workers Number of parallel workers for BiocParallel. Default 1 -#' (sequential). -#' @param verbose Logical, print progress (default TRUE). -#' @return The imputed matrix with the same dimensions as the input (minus -#' any all-NA columns). -#' @importFrom BiocParallel MulticoreParam SerialParam bplapply -#' @export -xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L, - nrounds = 50L, decreasing = FALSE, - num_workers = 1L, verbose = TRUE) { - if (!requireNamespace("xgboost", quietly = TRUE)) - stop("Package 'xgboost' is required for xgboost_imputation") - - xmis <- as.matrix(data) - n <- nrow(xmis) - p <- ncol(xmis) - - # Remove completely missing columns - all_na <- colSums(is.na(xmis)) == n - if (any(all_na)) { - if (verbose) - message("Removed ", sum(all_na), " column(s) with all entries missing.") - xmis <- xmis[, !all_na, drop = FALSE] - p <- ncol(xmis) - } - - # Initial mean imputation - ximp <- xmis - col_means <- colMeans(xmis, na.rm = TRUE) - for (j in seq_len(p)) { - ximp[is.na(xmis[, j]), j] <- col_means[j] - } - - # Missing value locations - NAloc <- is.na(xmis) - noNAvar <- colSums(NAloc) - sort_j <- order(noNAvar, decreasing = decreasing) - nzsort_j <- sort_j[noNAvar[sort_j] > 0] - - if (length(nzsort_j) == 0) { - if (verbose) message("No missing values to impute.") - return(ximp) - } - - # Set up BiocParallel - if (num_workers > 1L) { - BPPARAM <- MulticoreParam(workers = num_workers) - } else { - BPPARAM <- SerialParam() - } - - iter <- 0L - conv_new <- 0 - conv_old <- Inf - ximp_history <- vector("list", maxiter) - - while (conv_new < conv_old && iter < maxiter) { - if (iter > 0) conv_old <- conv_new - if (verbose) message(" XGBoost iteration ", iter + 1L, " in progress...") - - ximp_old <- ximp - - # Impute each variable with missing values - impute_one <- function(var_idx) { - obsi <- !NAloc[, var_idx] - misi <- NAloc[, var_idx] - obsY <- ximp[obsi, var_idx] - obsX <- ximp[obsi, -var_idx, drop = FALSE] - misX <- ximp[misi, -var_idx, drop = FALSE] - - xgb_train <- xgboost::xgb.DMatrix(data = obsX, label = obsY) - xgb_pred <- xgboost::xgb.DMatrix(data = misX) - model <- xgboost::xgb.train( - params = list(max_depth = max_depth, verbosity = 0), - data = xgb_train, nrounds = nrounds) - list(var_idx = var_idx, predicted = predict(model, xgb_pred)) - } - - results <- bplapply(nzsort_j, impute_one, BPPARAM = BPPARAM) - - for (res in results) { - misi <- NAloc[, res$var_idx] - ximp[misi, res$var_idx] <- res$predicted - } - - iter <- iter + 1L - ximp_history[[iter]] <- ximp - - # Convergence: relative change in imputed values - conv_new <- sum((ximp - ximp_old)^2) / sum(ximp^2) - } - - # Return last improving iteration - if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]] -} - -#' softImpute-based matrix imputation -#' -#' Imputes missing values in a numeric matrix via the soft-thresholded SVD -#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} -#' alternating-least-squares solver by default. -#' -#' \code{lambda} drives the bias-variance trade-off. By default the -#' function tunes it via held-out cross-validation: a random fraction of -#' observed cells is masked, softImpute is fit across a geometric grid of -#' lambdas with warm-start, and the lambda minimising RMSE on the masked -#' cells is selected. The final imputation then uses that lambda on the -#' full input. Pass a scalar \code{lambda} to skip tuning. -#' -#' Columns with all entries missing are removed before fitting and a -#' one-line message records the count. -#' -#' @param data Numeric matrix (samples x features) with missing values -#' (NA). -#' @param rank.max Maximum rank of the SVD solution. Default -#' \code{min(dim(data) - 1)}. -#' @param lambda Nuclear-norm penalty. \code{NULL} (default) tunes over -#' a default geometric grid via held-out CV. A scalar uses that lambda -#' directly with no tuning. A vector defines the tuning grid. -#' @param tune_fraction Fraction of observed cells held out for CV. -#' Default 0.05. Ignored when \code{lambda} is a scalar. -#' @param tune_seed Seed for the held-out mask. Default 42L. -#' @param type Optimisation type passed to \code{softImpute::softImpute}; -#' one of \code{"als"} (default, faster) or \code{"svd"}. -#' @param thresh Convergence threshold. Default 1e-5. -#' @param maxit Maximum iterations. Default 100. -#' @param verbose Logical, print progress (including per-lambda CV RMSE). -#' Default FALSE. -#' @return The imputed matrix with the same dimensions as the input -#' (minus any all-NA columns). Carries attributes -#' \code{"softimpute_lambda"} (the lambda used for the final fit) and, -#' when tuning was performed, \code{"softimpute_lambda_cv"} (a -#' data.frame of grid-search RMSE values). -#' @export -softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL, - tune_fraction = 0.05, - tune_seed = 42L, - type = c("als", "svd"), - thresh = 1e-5, maxit = 100, - verbose = FALSE) { - if (!requireNamespace("softImpute", quietly = TRUE)) { - stop("Package 'softImpute' is required for softimpute_imputation.") - } - type <- match.arg(type) - X <- as.matrix(data) - n <- nrow(X); p <- ncol(X) - all_na <- colSums(is.na(X)) == n - if (any(all_na)) { - if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.") - X <- X[, !all_na, drop = FALSE] - p <- ncol(X) - } - if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L) - lambda0_val <- softImpute::lambda0(X) - - # Decide whether to tune. lambda = scalar => no tuning; NULL/vector => tune. - tune <- is.null(lambda) || length(lambda) > 1L - if (tune) { - if (is.null(lambda)) { - # Default geometric grid from lambda0 down to 1e-3 * lambda0. - lambda_grid <- exp(seq(log(lambda0_val), - log(max(1e-3 * lambda0_val, .Machine$double.eps)), - length.out = 10L)) - } else { - lambda_grid <- sort(as.numeric(lambda), decreasing = TRUE) - } - cv_res <- .softimpute_cv_lambda(X, lambda_grid = lambda_grid, - rank.max = rank.max, - tune_fraction = tune_fraction, - tune_seed = tune_seed, - type = type, thresh = thresh, maxit = maxit, - verbose = verbose) - lambda <- cv_res$best_lambda - if (verbose) { - message(sprintf("softimpute_imputation: tuned lambda = %.4g (CV RMSE %.4f over %d cells)", - lambda, cv_res$best_rmse, cv_res$n_held_out)) - } - } - - fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda, - type = type, thresh = thresh, maxit = maxit, - trace.it = isTRUE(verbose)) - Ximp <- softImpute::complete(X, fit) - if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) - if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) - attr(Ximp, "softimpute_lambda") <- lambda - if (tune) attr(Ximp, "softimpute_lambda_cv") <- cv_res$grid - Ximp -} - -# Internal: CV-tune softImpute's lambda over a grid using a held-out mask. -# Uses softImpute's warm.start feature to chain fits along the lambda path -# (largest lambda first). Returns the chosen lambda, its CV RMSE, and the -# full grid scores. -.softimpute_cv_lambda <- function(X, lambda_grid, rank.max, - tune_fraction, tune_seed, - type, thresh, maxit, verbose) { - na_mask <- is.na(X) - obs_idx <- which(!na_mask) - n_hold <- max(1L, floor(tune_fraction * length(obs_idx))) - if (n_hold >= length(obs_idx)) { - stop("tune_fraction too large; would mask all observed entries.") - } - set.seed(tune_seed) - hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE) - truth <- X[hold_idx] - Xmasked <- X - Xmasked[hold_idx] <- NA_real_ - - lambdas <- sort(as.numeric(lambda_grid), decreasing = TRUE) - rmse_vec <- numeric(length(lambdas)) - warm <- NULL - for (i in seq_along(lambdas)) { - fit_args <- list(x = Xmasked, rank.max = rank.max, lambda = lambdas[i], - type = type, thresh = thresh, maxit = maxit, - trace.it = FALSE) - if (!is.null(warm)) fit_args$warm.start <- warm - fit <- tryCatch(do.call(softImpute::softImpute, fit_args), - error = function(e) NULL) - if (is.null(fit)) { - rmse_vec[i] <- NA_real_ - next - } - Ximp <- softImpute::complete(Xmasked, fit) - rmse_vec[i] <- sqrt(mean((Ximp[hold_idx] - truth)^2, na.rm = TRUE)) - warm <- fit - if (verbose) { - message(sprintf(" softimpute CV lambda = %.4g -> RMSE %.4f", - lambdas[i], rmse_vec[i])) - } - } - grid <- data.frame(lambda = lambdas, rmse = rmse_vec, - stringsAsFactors = FALSE) - ok <- which(is.finite(rmse_vec)) - if (length(ok) == 0L) { - stop("All softImpute CV fits failed; supply lambda explicitly.") - } - best <- ok[which.min(rmse_vec[ok])] - list(best_lambda = lambdas[best], best_rmse = rmse_vec[best], - n_held_out = n_hold, grid = grid) -} - -#' flashier-based matrix imputation -#' -#' Imputes missing values in a numeric matrix via empirical Bayes -#' matrix factorisation (\code{flashier::flash}). \code{flash} natively -#' supports NA entries and reconstructs them from the posterior fitted -#' values. The default prior families match those used by -#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row -#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column -#' factor. -#' -#' @param data Numeric matrix (samples x features) with missing values -#' (NA). -#' @param var_type Variance structure for the residuals, passed to -#' \code{flashier::flash}. Default 2 (per-column). -#' @param ebnm_fn Prior family. Default -#' \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}. -#' @param greedy_Kmax Maximum number of greedy factors. Default -#' \code{min(20L, min(dim(data)) - 1L)}. -#' @param backfit Logical; run a final backfit pass. Default TRUE. -#' @param verbose Numeric verbosity (0 silent, default). -#' @param ... Additional arguments forwarded to \code{flashier::flash}. -#' @return The imputed matrix with the same dimensions as the input -#' (minus any all-NA columns). -#' @export -flashier_imputation <- function(data, var_type = 2, - ebnm_fn = c(ebnm::ebnm_normal, - ebnm::ebnm_normal_scale_mixture), - greedy_Kmax = NULL, - backfit = TRUE, verbose = 0, ...) { - if (!requireNamespace("flashier", quietly = TRUE)) { - stop("Package 'flashier' is required for flashier_imputation.") - } - if (!requireNamespace("ebnm", quietly = TRUE)) { - stop("Package 'ebnm' is required for flashier_imputation.") - } - X <- as.matrix(data) - n <- nrow(X); p <- ncol(X) - all_na <- colSums(is.na(X)) == n - if (any(all_na)) { - if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) { - message("Removed ", sum(all_na), " column(s) with all entries missing.") - } - X <- X[, !all_na, drop = FALSE] - p <- ncol(X) - } - if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L)) - fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn, - greedy_Kmax = greedy_Kmax, backfit = backfit, - verbose = verbose, ...) - # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found, - # fall back to column means. - if (is.null(fl$n_factors) || fl$n_factors == 0) { - fitted <- matrix(colMeans(X, na.rm = TRUE), - nrow = n, ncol = p, byrow = TRUE) - } else { - fitted <- tcrossprod(fl$L_pm, fl$F_pm) - } - Ximp <- X - na_idx <- is.na(X) - Ximp[na_idx] <- fitted[na_idx] - if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X) - if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X) - Ximp -} - -#' Pipeline for molecular trait imputation -#' -#' Unified entry point for imputing missing values in a molecular trait -#' matrix (e.g. expression, splicing, protein abundance). Dispatches to -#' one or more underlying methods and optionally evaluates imputation -#' quality on a held-out random mask of observed entries. -#' -#' Available methods: -#' \itemize{ -#' \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative -#' XGBoost per-column models. No structural assumption on the matrix. -#' \item \code{"softimpute"} - \code{\link{softimpute_imputation}}, -#' soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a -#' low-rank smooth signal. -#' \item \code{"flashier"} - \code{\link{flashier_imputation}}, -#' empirical Bayes matrix factorisation via \code{flashier::flash} -#' with sparse + scale-mixture priors. -#' } -#' -#' @param data Numeric matrix (samples x features) with missing values. -#' @param methods Character vector of methods to fit. Any subset of -#' \code{c("xgboost", "softimpute", "flashier")}. Default fits all -#' three. -#' @param method_args Named list of per-method argument lists. List -#' names must match \code{methods}. Default empty. -#' @param evaluate Logical. If TRUE, mask a random fraction of observed -#' entries, run each method, and report root-mean-squared error on the -#' masked cells. Default FALSE. -#' @param eval_fraction Fraction of observed cells to hold out for -#' evaluation. Default 0.05. -#' @param eval_seed Random seed for the held-out mask. Default 42. -#' @param verbose Logical, print progress. Default FALSE. -#' -#' @return A list with: -#' \describe{ -#' \item{imputed}{Named list of imputed matrices, one per requested -#' method.} -#' \item{missingness}{Per-column NA count and overall NA fraction.} -#' \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with -#' one row per method: \code{method}, \code{rmse}, \code{n_held_out}, -#' \code{time_seconds}.} -#' \item{method_args}{The resolved per-method arguments used.} -#' } -#' @export -molecular_trait_imputation_pipeline <- function( - data, - methods = c("xgboost", "softimpute", "flashier"), - method_args = list(), - evaluate = FALSE, - eval_fraction = 0.05, - eval_seed = 42L, - verbose = FALSE) { - valid <- c("xgboost", "softimpute", "flashier") - if (!is.character(methods) || length(methods) == 0L) { - stop("methods must be a non-empty character vector.") - } - bad <- setdiff(methods, valid) - if (length(bad) > 0) { - stop("Unknown imputation method(s): ", paste(bad, collapse = ", "), - ". Valid options: ", paste(valid, collapse = ", ")) - } - methods <- unique(methods) - - Xfull <- as.matrix(data) - n <- nrow(Xfull); p <- ncol(Xfull) - na_mask_full <- is.na(Xfull) - na_per_col <- colSums(na_mask_full) - missingness <- list( - per_column = na_per_col, - n_missing = sum(na_mask_full), - fraction = sum(na_mask_full) / (n * p) - ) - if (verbose) { - message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.", - n, p, 100 * missingness$fraction)) - } - - dispatch <- list(xgboost = xgboost_imputation, - softimpute = softimpute_imputation, - flashier = flashier_imputation) - - run_one <- function(m, X_in) { - args <- method_args[[m]] %||% list() - do.call(dispatch[[m]], c(list(X_in), args)) - } - - # If evaluating, hold out a random subset of observed cells, run each - # method against the masked matrix, then score RMSE on the held-out - # cells. Final imputation is fit against the original matrix so the - # returned matrices use every observed cell. - eval_df <- NULL - if (isTRUE(evaluate)) { - obs_idx <- which(!na_mask_full) - n_hold <- max(1L, floor(eval_fraction * length(obs_idx))) - if (n_hold >= length(obs_idx)) { - stop("eval_fraction too large; would mask all observed entries.") - } - set.seed(eval_seed) - hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE) - Xmasked <- Xfull - held_truth <- Xfull[hold_idx] - Xmasked[hold_idx] <- NA_real_ - eval_records <- vector("list", length(methods)) - for (i in seq_along(methods)) { - m <- methods[i] - if (verbose) message("Evaluating ", m, " on held-out mask ...") - t0 <- proc.time() - Ximp_m <- run_one(m, Xmasked) - t1 <- proc.time() - pred <- Ximp_m[hold_idx] - rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE)) - eval_records[[i]] <- data.frame( - method = m, rmse = rmse, n_held_out = n_hold, - time_seconds = as.numeric((t1 - t0)["elapsed"]), - stringsAsFactors = FALSE - ) - } - eval_df <- do.call(rbind, eval_records) - rownames(eval_df) <- NULL - } - - # Final imputation: run each method on the full matrix - imputed <- list() - for (m in methods) { - if (verbose) message("Imputing with ", m, " ...") - imputed[[m]] <- run_one(m, Xfull) - } - - list( - imputed = imputed, - missingness = missingness, - evaluation = eval_df, - method_args = method_args - ) -} diff --git a/_pkgdown.yml b/_pkgdown.yml index efcf0b98..e8bdeade 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -45,12 +45,6 @@ navbar: aria-label: View source on GitHub articles: - - title: "Data Preparation" - desc: > - Imputing missing values in molecular trait matrices before - fine-mapping and TWAS weight training. - contents: - - molecular-imputation - title: "Quality Control" desc: > Allele alignment and LD-mismatch QC for GWAS and QTL summary diff --git a/man/flashier_imputation.Rd b/man/flashier_imputation.Rd deleted file mode 100644 index 7762c785..00000000 --- a/man/flashier_imputation.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/imputation.R -\name{flashier_imputation} -\alias{flashier_imputation} -\title{flashier-based matrix imputation} -\usage{ -flashier_imputation( - data, - var_type = 2, - ebnm_fn = c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture), - greedy_Kmax = NULL, - backfit = TRUE, - verbose = 0, - ... -) -} -\arguments{ -\item{data}{Numeric matrix (samples x features) with missing values -(NA).} - -\item{var_type}{Variance structure for the residuals, passed to -\code{flashier::flash}. Default 2 (per-column).} - -\item{ebnm_fn}{Prior family. Default -\code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.} - -\item{greedy_Kmax}{Maximum number of greedy factors. Default -\code{min(20L, min(dim(data)) - 1L)}.} - -\item{backfit}{Logical; run a final backfit pass. Default TRUE.} - -\item{verbose}{Numeric verbosity (0 silent, default).} - -\item{...}{Additional arguments forwarded to \code{flashier::flash}.} -} -\value{ -The imputed matrix with the same dimensions as the input - (minus any all-NA columns). -} -\description{ -Imputes missing values in a numeric matrix via empirical Bayes -matrix factorisation (\code{flashier::flash}). \code{flash} natively -supports NA entries and reconstructs them from the posterior fitted -values. The default prior families match those used by -\code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row -factor and \code{ebnm::ebnm_normal_scale_mixture} on the column -factor. -} diff --git a/man/molecular_trait_imputation_pipeline.Rd b/man/molecular_trait_imputation_pipeline.Rd deleted file mode 100644 index 7932803b..00000000 --- a/man/molecular_trait_imputation_pipeline.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/imputation.R -\name{molecular_trait_imputation_pipeline} -\alias{molecular_trait_imputation_pipeline} -\title{Pipeline for molecular trait imputation} -\usage{ -molecular_trait_imputation_pipeline( - data, - methods = c("xgboost", "softimpute", "flashier"), - method_args = list(), - evaluate = FALSE, - eval_fraction = 0.05, - eval_seed = 42L, - verbose = FALSE -) -} -\arguments{ -\item{data}{Numeric matrix (samples x features) with missing values.} - -\item{methods}{Character vector of methods to fit. Any subset of -\code{c("xgboost", "softimpute", "flashier")}. Default fits all -three.} - -\item{method_args}{Named list of per-method argument lists. List -names must match \code{methods}. Default empty.} - -\item{evaluate}{Logical. If TRUE, mask a random fraction of observed -entries, run each method, and report root-mean-squared error on the -masked cells. Default FALSE.} - -\item{eval_fraction}{Fraction of observed cells to hold out for -evaluation. Default 0.05.} - -\item{eval_seed}{Random seed for the held-out mask. Default 42.} - -\item{verbose}{Logical, print progress. Default FALSE.} -} -\value{ -A list with: -\describe{ - \item{imputed}{Named list of imputed matrices, one per requested - method.} - \item{missingness}{Per-column NA count and overall NA fraction.} - \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with - one row per method: \code{method}, \code{rmse}, \code{n_held_out}, - \code{time_seconds}.} - \item{method_args}{The resolved per-method arguments used.} -} -} -\description{ -Unified entry point for imputing missing values in a molecular trait -matrix (e.g. expression, splicing, protein abundance). Dispatches to -one or more underlying methods and optionally evaluates imputation -quality on a held-out random mask of observed entries. -} -\details{ -Available methods: -\itemize{ - \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative - XGBoost per-column models. No structural assumption on the matrix. - \item \code{"softimpute"} - \code{\link{softimpute_imputation}}, - soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a - low-rank smooth signal. - \item \code{"flashier"} - \code{\link{flashier_imputation}}, - empirical Bayes matrix factorisation via \code{flashier::flash} - with sparse + scale-mixture priors. -} -} diff --git a/man/softimpute_imputation.Rd b/man/softimpute_imputation.Rd deleted file mode 100644 index 89eadff7..00000000 --- a/man/softimpute_imputation.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/imputation.R -\name{softimpute_imputation} -\alias{softimpute_imputation} -\title{softImpute-based matrix imputation} -\usage{ -softimpute_imputation( - data, - rank.max = NULL, - lambda = NULL, - tune_fraction = 0.05, - tune_seed = 42L, - type = c("als", "svd"), - thresh = 1e-05, - maxit = 100, - verbose = FALSE -) -} -\arguments{ -\item{data}{Numeric matrix (samples x features) with missing values -(NA).} - -\item{rank.max}{Maximum rank of the SVD solution. Default -\code{min(dim(data) - 1)}.} - -\item{lambda}{Nuclear-norm penalty. \code{NULL} (default) tunes over -a default geometric grid via held-out CV. A scalar uses that lambda -directly with no tuning. A vector defines the tuning grid.} - -\item{tune_fraction}{Fraction of observed cells held out for CV. -Default 0.05. Ignored when \code{lambda} is a scalar.} - -\item{tune_seed}{Seed for the held-out mask. Default 42L.} - -\item{type}{Optimisation type passed to \code{softImpute::softImpute}; -one of \code{"als"} (default, faster) or \code{"svd"}.} - -\item{thresh}{Convergence threshold. Default 1e-5.} - -\item{maxit}{Maximum iterations. Default 100.} - -\item{verbose}{Logical, print progress (including per-lambda CV RMSE). -Default FALSE.} -} -\value{ -The imputed matrix with the same dimensions as the input - (minus any all-NA columns). Carries attributes - \code{"softimpute_lambda"} (the lambda used for the final fit) and, - when tuning was performed, \code{"softimpute_lambda_cv"} (a - data.frame of grid-search RMSE values). -} -\description{ -Imputes missing values in a numeric matrix via the soft-thresholded SVD -algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"} -alternating-least-squares solver by default. -} -\details{ -\code{lambda} drives the bias-variance trade-off. By default the -function tunes it via held-out cross-validation: a random fraction of -observed cells is masked, softImpute is fit across a geometric grid of -lambdas with warm-start, and the lambda minimising RMSE on the masked -cells is selected. The final imputation then uses that lambda on the -full input. Pass a scalar \code{lambda} to skip tuning. - -Columns with all entries missing are removed before fitting and a -one-line message records the count. -} diff --git a/man/xgboost_imputation.Rd b/man/xgboost_imputation.Rd deleted file mode 100644 index b09309ee..00000000 --- a/man/xgboost_imputation.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/imputation.R -\name{xgboost_imputation} -\alias{xgboost_imputation} -\title{XGBoost-based iterative imputation of missing values} -\usage{ -xgboost_imputation( - data, - maxiter = 10L, - max_depth = 2L, - nrounds = 50L, - decreasing = FALSE, - num_workers = 1L, - verbose = TRUE -) -} -\arguments{ -\item{data}{Numeric matrix with missing values (NA).} - -\item{maxiter}{Maximum number of imputation iterations (default 10).} - -\item{max_depth}{Maximum tree depth for XGBoost (default 2).} - -\item{nrounds}{Number of boosting rounds per variable (default 50).} - -\item{decreasing}{Logical. If TRUE, impute variables with most missing -values first. Default FALSE (fewest missing first).} - -\item{num_workers}{Number of parallel workers for BiocParallel. Default 1 -(sequential).} - -\item{verbose}{Logical, print progress (default TRUE).} -} -\value{ -The imputed matrix with the same dimensions as the input (minus - any all-NA columns). -} -\description{ -Imputes missing values in a numeric matrix by iteratively training -per-column XGBoost models on observed entries and predicting missing ones. -Columns that are entirely missing are removed. Initial imputation uses -column means. -} diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 7dfb1068..013db71d 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -1684,50 +1684,6 @@ test_that("twas_method_cor with diagonal LD", { expect_equal(result[1, 2], 0) }) -# ============================================================================= -# xgboost_imputation -# ============================================================================= - -test_that("xgboost_imputation works on simple matrix", { - skip_if_not_installed("xgboost") - set.seed(42) - mat <- matrix(rnorm(100), nrow = 20, ncol = 5) - colnames(mat) <- paste0("V", 1:5) - # Introduce some missing values - mat[1, 1] <- NA - mat[5, 3] <- NA - mat[10, 2] <- NA - result <- xgboost_imputation(mat, maxiter = 2, nrounds = 10, verbose = FALSE) - expect_equal(dim(result), dim(mat)) - expect_false(anyNA(result)) -}) - -test_that("xgboost_imputation removes all-NA columns", { - skip_if_not_installed("xgboost") - set.seed(42) - mat <- matrix(rnorm(80), nrow = 20, ncol = 4) - colnames(mat) <- paste0("V", 1:4) - mat[, 3] <- NA # entirely missing column - mat[1, 1] <- NA - result <- expect_message( - xgboost_imputation(mat, maxiter = 2, nrounds = 10, verbose = TRUE), - "Removed 1 column" - ) - expect_equal(ncol(result), 3) - expect_false(anyNA(result)) -}) - -test_that("xgboost_imputation with no missing returns data unchanged", { - skip_if_not_installed("xgboost") - set.seed(42) - mat <- matrix(rnorm(60), nrow = 10, ncol = 6) - result <- expect_message( - xgboost_imputation(mat, maxiter = 2, verbose = TRUE), - "No missing values" - ) - expect_equal(result, mat) -}) - # ============================================================================= # compute_qvalues — uncovered branches # ============================================================================= diff --git a/tests/testthat/test_molecular_imputation.R b/tests/testthat/test_molecular_imputation.R deleted file mode 100644 index 54d39a4a..00000000 --- a/tests/testthat/test_molecular_imputation.R +++ /dev/null @@ -1,185 +0,0 @@ -context("Molecular trait imputation pipeline") - -# Helper: low-rank synthetic matrix with NA mask. -.make_imputation_inputs <- function(n = 60, p = 15, rank = 3, na_frac = 0.1, - seed = 1) { - set.seed(seed) - U <- matrix(rnorm(n * rank), n, rank) - V <- matrix(rnorm(p * rank), p, rank) - truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p) - X <- truth - na_mask <- matrix(runif(n * p) < na_frac, n, p) - X[na_mask] <- NA - rownames(X) <- paste0("S", seq_len(n)) - colnames(X) <- paste0("F", seq_len(p)) - list(X = X, truth = truth, na_mask = na_mask) -} - -# ============================================================================= -# softimpute_imputation -# ============================================================================= - -test_that("softimpute_imputation: returns matrix with same dims, no NAs", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - Xi <- softimpute_imputation(inp$X) - expect_equal(dim(Xi), dim(inp$X)) - expect_false(any(is.na(Xi))) - expect_equal(rownames(Xi), rownames(inp$X)) -}) - -test_that("softimpute_imputation: handles all-NA columns by removing them", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - X <- inp$X - X[, 5] <- NA - expect_message( - Xi <- softimpute_imputation(X, verbose = TRUE), - "Removed 1 column" - ) - expect_equal(ncol(Xi), ncol(X) - 1L) -}) - -test_that("softimpute_imputation: default tunes lambda via CV and attaches grid", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - Xi <- softimpute_imputation(inp$X) # default lambda = NULL -> tune - expect_false(is.null(attr(Xi, "softimpute_lambda"))) - grid <- attr(Xi, "softimpute_lambda_cv") - expect_s3_class(grid, "data.frame") - expect_named(grid, c("lambda", "rmse")) - expect_true(all(diff(grid$lambda) <= 0)) # grid sorted descending - expect_true(any(is.finite(grid$rmse))) - # The chosen lambda must be one of the grid points - expect_true(attr(Xi, "softimpute_lambda") %in% grid$lambda) -}) - -test_that("softimpute_imputation: scalar lambda bypasses tuning", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - l0 <- softImpute::lambda0(inp$X) - Xi <- softimpute_imputation(inp$X, lambda = 0.3 * l0) - expect_equal(attr(Xi, "softimpute_lambda"), 0.3 * l0) - expect_null(attr(Xi, "softimpute_lambda_cv")) -}) - -test_that("softimpute_imputation: vector lambda restricts tuning to that grid", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - l0 <- softImpute::lambda0(inp$X) - custom_grid <- l0 * c(0.05, 0.1, 0.25, 0.5) - Xi <- softimpute_imputation(inp$X, lambda = custom_grid) - grid <- attr(Xi, "softimpute_lambda_cv") - expect_setequal(grid$lambda, custom_grid) - expect_true(attr(Xi, "softimpute_lambda") %in% custom_grid) -}) - -test_that("softimpute_imputation: chosen lambda minimises CV RMSE on the grid", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs(n = 80, p = 20, rank = 3, seed = 11) - Xi <- softimpute_imputation(inp$X) - grid <- attr(Xi, "softimpute_lambda_cv") - chosen <- attr(Xi, "softimpute_lambda") - ok <- which(is.finite(grid$rmse)) - best_idx <- ok[which.min(grid$rmse[ok])] - expect_equal(chosen, grid$lambda[best_idx]) -}) - -# ============================================================================= -# flashier_imputation -# ============================================================================= - -test_that("flashier_imputation: returns matrix with same dims, no NAs", { - skip_if_not_installed("flashier") - skip_if_not_installed("ebnm") - inp <- .make_imputation_inputs() - Xi <- flashier_imputation(inp$X, verbose = 0) - expect_equal(dim(Xi), dim(inp$X)) - expect_false(any(is.na(Xi))) -}) - -test_that("flashier_imputation: recovers low-rank truth better than column means", { - skip_if_not_installed("flashier") - skip_if_not_installed("ebnm") - inp <- .make_imputation_inputs() - Xi <- flashier_imputation(inp$X, verbose = 0) - # Baseline: column-mean imputation - Xbase <- inp$X - cm <- colMeans(Xbase, na.rm = TRUE) - for (j in seq_len(ncol(Xbase))) Xbase[is.na(Xbase[, j]), j] <- cm[j] - rmse_flash <- sqrt(mean((Xi[inp$na_mask] - inp$truth[inp$na_mask])^2)) - rmse_mean <- sqrt(mean((Xbase[inp$na_mask] - inp$truth[inp$na_mask])^2)) - expect_lt(rmse_flash, rmse_mean) -}) - -# ============================================================================= -# molecular_trait_imputation_pipeline -# ============================================================================= - -test_that("pipeline: single-method dispatch returns named list keyed by method", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute") - expect_named(res$imputed, "softimpute") - expect_equal(dim(res$imputed[["softimpute"]]), dim(inp$X)) - expect_null(res$evaluation) -}) - -test_that("pipeline: multi-method runs all three and returns one matrix per method", { - skip_if_not_installed("softImpute") - skip_if_not_installed("flashier") - skip_if_not_installed("xgboost") - inp <- .make_imputation_inputs() - res <- molecular_trait_imputation_pipeline( - inp$X, methods = c("xgboost", "softimpute", "flashier"), - method_args = list( - xgboost = list(maxiter = 2, nrounds = 10, verbose = FALSE) - ) - ) - expect_named(res$imputed, c("xgboost", "softimpute", "flashier")) - for (m in names(res$imputed)) { - expect_equal(dim(res$imputed[[m]]), dim(inp$X)) - expect_false(any(is.na(res$imputed[[m]]))) - } -}) - -test_that("pipeline: evaluate = TRUE reports RMSE on held-out cells", { - skip_if_not_installed("softImpute") - skip_if_not_installed("flashier") - inp <- .make_imputation_inputs() - res <- molecular_trait_imputation_pipeline( - inp$X, methods = c("softimpute", "flashier"), - evaluate = TRUE, eval_fraction = 0.1, eval_seed = 7 - ) - expect_s3_class(res$evaluation, "data.frame") - expect_equal(nrow(res$evaluation), 2L) - expect_named(res$evaluation, - c("method", "rmse", "n_held_out", "time_seconds")) - expect_true(all(res$evaluation$rmse > 0)) - expect_true(all(res$evaluation$n_held_out > 0)) -}) - -test_that("pipeline: missingness summary matches input NA count", { - skip_if_not_installed("softImpute") - inp <- .make_imputation_inputs() - res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute") - expect_equal(res$missingness$n_missing, sum(is.na(inp$X))) - expect_equal(res$missingness$fraction, - sum(is.na(inp$X)) / prod(dim(inp$X))) -}) - -test_that("pipeline: rejects unknown method", { - inp <- .make_imputation_inputs() - expect_error( - molecular_trait_imputation_pipeline(inp$X, methods = "knnimpute"), - "Unknown imputation method" - ) -}) - -test_that("pipeline: rejects empty methods vector", { - inp <- .make_imputation_inputs() - expect_error( - molecular_trait_imputation_pipeline(inp$X, methods = character(0)), - "non-empty character vector" - ) -}) From 032f5cc553d10fcbbef745b290bfc9edba65bfb6 Mon Sep 17 00:00:00 2001 From: danielnachun Date: Sat, 6 Jun 2026 02:11:40 +0000 Subject: [PATCH 6/6] Update documentation --- man/load_multitask_regional_data.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/load_multitask_regional_data.Rd b/man/load_multitask_regional_data.Rd index 57b231ab..8f74cb72 100644 --- a/man/load_multitask_regional_data.Rd +++ b/man/load_multitask_regional_data.Rd @@ -129,10 +129,10 @@ This function loads a mixture data sets for a specific region, including individ or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets } \section{Loading individual level data from multiple corhorts}{ - +NA } \section{Loading summary statistics from multiple corhorts or data set}{ - +NA }