From 4a99b87ceb8332531ae708929af8870d2d950e92 Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Thu, 4 Jun 2026 22:19:33 -0700
Subject: [PATCH 1/6] allow susie-inf to initiliaze susie-ash

---
 R/susie_wrapper.R                   | 53 +++++++++++-----
 R/univariate_pipeline.R             | 57 +++++++++++------
 man/rss_analysis_pipeline.Rd        | 12 ++--
 man/susie_rss_pipeline.Rd           | 17 ++---
 man/univariate_analysis_pipeline.Rd |  4 +-
 tests/testthat/test_methods_param.R | 96 +++++++++++++++++++++++++++++
 vignettes/fine-mapping.Rmd          | 31 +++++++---
 7 files changed, 213 insertions(+), 57 deletions(-)

diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R
index 7dbdaffd..167905a9 100644
--- a/R/susie_wrapper.R
+++ b/R/susie_wrapper.R
@@ -111,12 +111,21 @@ format_cs_column <- function(coverage, method) {
   fit
 }
 
-prepare_susie_from_inf_args <- function(args, susie_inf_fit, refine_default = NULL) {
+# Build the argument list for a SuSiE / SuSiE-ash fit initialised from a
+# prior SuSiE-inf fit. `unmappable_effects` controls which branch the
+# downstream fit takes: "none" yields the standard SuSiE-inf-initialised
+# SuSiE; "ash" yields SuSiE-ash with the SuSiE-inf warm start.
+prepare_susie_from_inf_args <- function(args, susie_inf_fit, refine_default = NULL,
+                                        unmappable_effects = c("none", "ash")) {
+  unmappable_effects <- match.arg(unmappable_effects)
   L <- args[["L"]]
   if (is.null(L)) L <- length(susie_inf_fit$V)
   if (is.null(args[["refine"]]) && !is.null(refine_default)) args[["refine"]] <- refine_default
-  args[["unmappable_effects"]] <- "none"
+  args[["unmappable_effects"]] <- unmappable_effects
   args[["model_init"]] <- susie_inf_fit
+  if (unmappable_effects == "ash") {
+    args[["convergence_method"]] <- args[["convergence_method"]] %||% "pip"
+  }
   if (!is.null(args[["L_greedy"]])) args[["L_greedy"]] <- min(length(susie_inf_fit$V), L)
   args
 }
@@ -893,14 +902,15 @@ adjust_susie_weights <- function(twas_weights_results, keep_variants, run_allele
 #'   fit. Any subset of \code{c("susie_rss", "susie_inf_rss",
 #'   "susie_ash_rss")}. Default \code{NULL} falls back to a single-method fit
 #'   driven by \code{analysis_method} (backward-compatible behavior). When
-#'   \code{methods} is passed explicitly, each requested method is fitted; if
-#'   both \code{"susie_inf_rss"} and \code{"susie_rss"} are present and
-#'   \code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the
-#'   SuSiE-inf-RSS result.
-#' @param add_susie_inf Logical. When \code{methods} contains both
-#'   \code{"susie_inf_rss"} and \code{"susie_rss"}, controls whether the
-#'   SuSiE-RSS fit uses the SuSiE-inf-RSS result as initialisation. Default
-#'   \code{TRUE}.
+#'   \code{methods} is passed explicitly, each requested method is fitted;
+#'   if \code{"susie_inf_rss"} is paired with \code{"susie_rss"} or
+#'   \code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the
+#'   SuSiE-inf-RSS fit initialises the downstream method. This exposes five
+#'   distinct fitting modes mirroring the individual-level pipeline.
+#' @param add_susie_inf Logical. When \code{methods} contains
+#'   \code{"susie_inf_rss"} alongside \code{"susie_rss"} and/or
+#'   \code{"susie_ash_rss"}, controls whether SuSiE-inf-RSS is chained into
+#'   the downstream method(s) as initialisation. Default \code{TRUE}.
 #' @param coverage Coverage level (default: 0.95).
 #' @param secondary_coverage Secondary coverage levels (default: c(0.7, 0.5)).
 #' @param signal_cutoff PIP cutoff for selecting top loci (default: 0.1).
@@ -949,8 +959,11 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL,
     }
     fit_methods <- unique(methods)
   }
-  chain_inf_to_susie_rss <- isTRUE(add_susie_inf) &&
+  chain_inf_to_susie_rss     <- isTRUE(add_susie_inf) &&
     all(c("susie_inf_rss", "susie_rss") %in% fit_methods)
+  chain_inf_to_susie_ash_rss <- isTRUE(add_susie_inf) &&
+    all(c("susie_inf_rss", "susie_ash_rss") %in% fit_methods)
+  any_chained_init_rss <- chain_inf_to_susie_rss || chain_inf_to_susie_ash_rss
 
   if (!is.null(sumstats$z)) {
     z <- sumstats$z
@@ -992,7 +1005,7 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL,
   }
 
   fitted_models <- list()
-  if ("susie_inf_rss" %in% fit_methods || chain_inf_to_susie_rss) {
+  if ("susie_inf_rss" %in% fit_methods || any_chained_init_rss) {
     inf_fit <- fit_one_susie_inf_rss()
     fitted_models[["susie_inf_rss"]] <- .set_finemapping_fit_class(inf_fit, "susie_inf_rss")
   }
@@ -1002,7 +1015,8 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL,
     if (chain_inf_to_susie_rss) {
       chained_args <- prepare_susie_from_inf_args(
         list(L = L, L_greedy = L_greedy),
-        fitted_models[["susie_inf_rss"]], refine_default = TRUE
+        fitted_models[["susie_inf_rss"]], refine_default = TRUE,
+        unmappable_effects = "none"
       )
       rss_fit <- do.call(susie_rss, c(common, chained_args))
     } else {
@@ -1013,12 +1027,21 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL,
     fitted_models[[rss_label]] <- .set_finemapping_fit_class(rss_fit, rss_label)
   }
   if ("susie_ash_rss" %in% fit_methods) {
-    ash_fit <- fit_one_susie_ash_rss()
+    if (chain_inf_to_susie_ash_rss) {
+      chained_args <- prepare_susie_from_inf_args(
+        list(L = L, L_greedy = L_greedy),
+        fitted_models[["susie_inf_rss"]], refine_default = NULL,
+        unmappable_effects = "ash"
+      )
+      ash_fit <- do.call(susie_rss, c(common, chained_args))
+    } else {
+      ash_fit <- fit_one_susie_ash_rss()
+    }
     fitted_models[["susie_ash_rss"]] <- .set_finemapping_fit_class(ash_fit, "susie_ash_rss")
   }
 
   # Drop SuSiE-inf-RSS from post-processing if it was only fit for init
-  if (chain_inf_to_susie_rss && !("susie_inf_rss" %in% fit_methods)) {
+  if (any_chained_init_rss && !("susie_inf_rss" %in% fit_methods)) {
     fitted_models[["susie_inf_rss"]] <- NULL
   }
 
diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R
index f8ee219c..3ee6fdf9 100644
--- a/R/univariate_pipeline.R
+++ b/R/univariate_pipeline.R
@@ -37,7 +37,9 @@
 #'   passed explicitly, each requested method is fitted; if
 #'   \code{"susie_inf"} is paired with \code{"susie"} or \code{"susie_ash"}
 #'   (or both) and \code{add_susie_inf = TRUE}, the SuSiE-inf fit
-#'   initialises each chained downstream method.
+#'   initialises each chained downstream method. This gives five distinct
+#'   fitting modes: SuSiE alone, SuSiE with SuSiE-inf init, SuSiE-inf alone,
+#'   SuSiE-ash alone, and SuSiE-ash with SuSiE-inf init.
 #' @param add_susie_inf When \code{methods} is \code{NULL}, controls whether
 #'   SuSiE-inf is fitted and chained into SuSiE. When \code{methods} is set
 #'   explicitly, controls whether the chained-init shortcut is applied to
@@ -116,8 +118,13 @@ univariate_analysis_pipeline <- function(
     }
     methods <- unique(methods)
   }
-  chain_inf_to_susie <- isTRUE(add_susie_inf) &&
+  # SuSiE-inf initialisation chains into SuSiE and/or SuSiE-ash whenever
+  # either of them is requested alongside SuSiE-inf and add_susie_inf is TRUE.
+  chain_inf_to_susie     <- isTRUE(add_susie_inf) &&
     all(c("susie_inf", "susie") %in% methods)
+  chain_inf_to_susie_ash <- isTRUE(add_susie_inf) &&
+    all(c("susie_inf", "susie_ash") %in% methods)
+  any_chained_init <- chain_inf_to_susie || chain_inf_to_susie_ash
   if (isTRUE(twas_weights) && !("susie" %in% methods)) {
     stop("twas_weights = TRUE requires \"susie\" to be in methods.")
   }
@@ -169,7 +176,7 @@ univariate_analysis_pipeline <- function(
   )
   fitted_models <- list()
 
-  if ("susie_inf" %in% methods || chain_inf_to_susie) {
+  if ("susie_inf" %in% methods || any_chained_init) {
     message("Fitting SuSiE-inf model on input data ...")
     inf_args <- modifyList(susie_args, list(
       X = X, y = Y,
@@ -186,7 +193,8 @@ univariate_analysis_pipeline <- function(
       message("Fitting SuSiE model initialized by SuSiE-inf ...")
       su_args <- prepare_susie_from_inf_args(susie_args,
                                              fitted_models[["susie_inf"]],
-                                             refine_default = TRUE)
+                                             refine_default = TRUE,
+                                             unmappable_effects = "none")
       su_fit <- do.call(susie, c(list(X = X, y = Y), su_args))
     } else {
       message("Fitting SuSiE model on input data ...")
@@ -196,19 +204,28 @@ univariate_analysis_pipeline <- function(
   }
 
   if ("susie_ash" %in% methods) {
-    message("Fitting SuSiE-ash model on input data ...")
-    ash_args <- modifyList(susie_args, list(
-      X = X, y = Y,
-      unmappable_effects = "ash",
-      convergence_method = "pip"
-    ))
-    ash_fit <- do.call(susie, ash_args)
+    if (chain_inf_to_susie_ash) {
+      message("Fitting SuSiE-ash model initialized by SuSiE-inf ...")
+      ash_args <- prepare_susie_from_inf_args(susie_args,
+                                              fitted_models[["susie_inf"]],
+                                              refine_default = NULL,
+                                              unmappable_effects = "ash")
+      ash_fit <- do.call(susie, c(list(X = X, y = Y), ash_args))
+    } else {
+      message("Fitting SuSiE-ash model on input data ...")
+      ash_args <- modifyList(susie_args, list(
+        X = X, y = Y,
+        unmappable_effects = "ash",
+        convergence_method = "pip"
+      ))
+      ash_fit <- do.call(susie, ash_args)
+    }
     fitted_models[["susie_ash"]] <- .set_finemapping_fit_class(ash_fit, "susie_ash")
   }
 
-  # Drop susie_inf from post-processing if it was only fit to provide init for SuSiE
-  # (i.e. caller did not request "susie_inf" in methods)
-  if (chain_inf_to_susie && !("susie_inf" %in% methods)) {
+  # Drop susie_inf from post-processing if it was only fit to provide init for
+  # SuSiE / SuSiE-ash (i.e. caller did not request "susie_inf" in methods).
+  if (any_chained_init && !("susie_inf" %in% methods)) {
     fitted_models[["susie_inf"]] <- NULL
   }
 
@@ -321,12 +338,12 @@ load_study_LD <- function(ld_path, region) {
 #'   "susie_ash_rss")}. Default \code{NULL} preserves legacy single-method
 #'   behavior via \code{finemapping_method}. When set explicitly, every
 #'   requested method contributes rows to the unified \code{top_loci}; when
-#'   both \code{"susie_inf_rss"} and \code{"susie_rss"} are requested and
-#'   \code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the
-#'   SuSiE-inf-RSS result.
-#' @param add_susie_inf Logical controlling chained init when both
-#'   \code{"susie_inf_rss"} and \code{"susie_rss"} are in \code{methods}.
-#'   Default \code{TRUE}.
+#'   \code{"susie_inf_rss"} is paired with \code{"susie_rss"} or
+#'   \code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the
+#'   SuSiE-inf-RSS fit initialises the chained downstream method(s).
+#' @param add_susie_inf Logical controlling chained init when
+#'   \code{"susie_inf_rss"} is in \code{methods} alongside
+#'   \code{"susie_rss"} and/or \code{"susie_ash_rss"}. Default \code{TRUE}.
 #' @param finemapping_opts List of fine-mapping options (L, L_greedy, coverage,
 #'   signal_cutoff, min_abs_corr).
 #' @param impute Whether to impute missing variants via RAISS (default TRUE).
diff --git a/man/rss_analysis_pipeline.Rd b/man/rss_analysis_pipeline.Rd
index a9b87813..e2a78d75 100644
--- a/man/rss_analysis_pipeline.Rd
+++ b/man/rss_analysis_pipeline.Rd
@@ -69,13 +69,13 @@ variants to fit. Any subset of \code{c("susie_rss", "susie_inf_rss",
 "susie_ash_rss")}. Default \code{NULL} preserves legacy single-method
 behavior via \code{finemapping_method}. When set explicitly, every
 requested method contributes rows to the unified \code{top_loci}; when
-both \code{"susie_inf_rss"} and \code{"susie_rss"} are requested and
-\code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the
-SuSiE-inf-RSS result.}
+\code{"susie_inf_rss"} is paired with \code{"susie_rss"} or
+\code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the
+SuSiE-inf-RSS fit initialises the chained downstream method(s).}
 
-\item{add_susie_inf}{Logical controlling chained init when both
-\code{"susie_inf_rss"} and \code{"susie_rss"} are in \code{methods}.
-Default \code{TRUE}.}
+\item{add_susie_inf}{Logical controlling chained init when
+\code{"susie_inf_rss"} is in \code{methods} alongside
+\code{"susie_rss"} and/or \code{"susie_ash_rss"}. Default \code{TRUE}.}
 
 \item{finemapping_opts}{List of fine-mapping options (L, L_greedy, coverage,
 signal_cutoff, min_abs_corr).}
diff --git a/man/susie_rss_pipeline.Rd b/man/susie_rss_pipeline.Rd
index bbbe6705..375837e3 100644
--- a/man/susie_rss_pipeline.Rd
+++ b/man/susie_rss_pipeline.Rd
@@ -46,15 +46,16 @@ method; ignored for \code{"susie_inf_rss"} and \code{"susie_ash_rss"}.}
 fit. Any subset of \code{c("susie_rss", "susie_inf_rss",
 "susie_ash_rss")}. Default \code{NULL} falls back to a single-method fit
 driven by \code{analysis_method} (backward-compatible behavior). When
-\code{methods} is passed explicitly, each requested method is fitted; if
-both \code{"susie_inf_rss"} and \code{"susie_rss"} are present and
-\code{add_susie_inf = TRUE}, SuSiE-RSS is initialised from the
-SuSiE-inf-RSS result.}
+\code{methods} is passed explicitly, each requested method is fitted;
+if \code{"susie_inf_rss"} is paired with \code{"susie_rss"} or
+\code{"susie_ash_rss"} (or both) and \code{add_susie_inf = TRUE}, the
+SuSiE-inf-RSS fit initialises the downstream method. This exposes five
+distinct fitting modes mirroring the individual-level pipeline.}
 
-\item{add_susie_inf}{Logical. When \code{methods} contains both
-\code{"susie_inf_rss"} and \code{"susie_rss"}, controls whether the
-SuSiE-RSS fit uses the SuSiE-inf-RSS result as initialisation. Default
-\code{TRUE}.}
+\item{add_susie_inf}{Logical. When \code{methods} contains
+\code{"susie_inf_rss"} alongside \code{"susie_rss"} and/or
+\code{"susie_ash_rss"}, controls whether SuSiE-inf-RSS is chained into
+the downstream method(s) as initialisation. Default \code{TRUE}.}
 
 \item{coverage}{Coverage level (default: 0.95).}
 
diff --git a/man/univariate_analysis_pipeline.Rd b/man/univariate_analysis_pipeline.Rd
index 5323ac6e..3f82d2d9 100644
--- a/man/univariate_analysis_pipeline.Rd
+++ b/man/univariate_analysis_pipeline.Rd
@@ -89,7 +89,9 @@ SuSiE fit as initialization; \code{add_susie_inf = FALSE} maps to
 passed explicitly, each requested method is fitted; if
 \code{"susie_inf"} is paired with \code{"susie"} or \code{"susie_ash"}
 (or both) and \code{add_susie_inf = TRUE}, the SuSiE-inf fit
-initialises each chained downstream method.}
+initialises each chained downstream method. This gives five distinct
+fitting modes: SuSiE alone, SuSiE with SuSiE-inf init, SuSiE-inf alone,
+SuSiE-ash alone, and SuSiE-ash with SuSiE-inf init.}
 
 \item{add_susie_inf}{When \code{methods} is \code{NULL}, controls whether
 SuSiE-inf is fitted and chained into SuSiE. When \code{methods} is set
diff --git a/tests/testthat/test_methods_param.R b/tests/testthat/test_methods_param.R
index df158b3f..b0a0981a 100644
--- a/tests/testthat/test_methods_param.R
+++ b/tests/testthat/test_methods_param.R
@@ -108,6 +108,102 @@ test_that("univariate_analysis_pipeline: unknown method rejected", {
   )
 })
 
+# Fifth fitting mode: SuSiE-ash with SuSiE-inf init -----------------------
+
+# Helper: signal-rich synthetic data where SuSiE-inf finds non-zero mappable
+# effects, so model_init genuinely carries information into SuSiE / SuSiE-ash.
+# (The shipped eqtl_region_example is too weak: SuSiE-inf converges with all V=0
+# / mu=0, which susieR's extract_model_init_fields correctly returns NULL for,
+# collapsing chained and independent runs to identical results.)
+.make_strong_signal_inputs <- function(seed = 1) {
+  set.seed(seed)
+  n <- 400; p <- 200
+  X <- matrix(rnorm(n * p), n, p)
+  colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:T")
+  rownames(X) <- paste0("S", seq_len(n))
+  beta_sparse <- rep(0, p); beta_sparse[c(20, 70, 130)] <- 2
+  beta_poly   <- rnorm(p, sd = 0.05)
+  y <- as.numeric(X %*% (beta_sparse + beta_poly) + rnorm(n))
+  maf <- rep(0.3, p)
+  list(X = X, y = y, maf = maf)
+}
+
+test_that("univariate_analysis_pipeline: chained SuSiE-inf -> SuSiE-ash differs from independent susie_ash", {
+  skip_if_not_installed("susieR")
+  inp <- .make_strong_signal_inputs()
+  chained <- univariate_analysis_pipeline(
+    X = inp$X, Y = inp$y, maf = inp$maf,
+    methods = c("susie_inf", "susie_ash"), add_susie_inf = TRUE,
+    twas_weights = FALSE,
+    finemapping_extra_opts = list(refine = FALSE),
+    signal_cutoff = 0, verbose = 0
+  )
+  indep <- univariate_analysis_pipeline(
+    X = inp$X, Y = inp$y, maf = inp$maf,
+    methods = c("susie_inf", "susie_ash"), add_susie_inf = FALSE,
+    twas_weights = FALSE,
+    finemapping_extra_opts = list(refine = FALSE),
+    signal_cutoff = 0, verbose = 0
+  )
+  expect_true(all(c("susie_inf", "susie_ash") %in% unique(chained$top_loci$method)))
+  expect_true(all(c("susie_inf", "susie_ash") %in% unique(indep$top_loci$method)))
+  expect_false(is.null(chained$susie_inf_fitted))
+  expect_false(is.null(chained$susie_ash_fitted))
+  # SuSiE-inf must have non-trivial mappable effects for model_init to matter
+  expect_true(any(chained$susie_inf_fitted$V > 0))
+  # With non-trivial init, chained SuSiE-ash PIPs differ from independent run
+  pip_chained <- chained$susie_ash_fitted$pip
+  pip_indep   <- indep$susie_ash_fitted$pip
+  expect_equal(length(pip_chained), length(pip_indep))
+  expect_true(max(abs(pip_chained - pip_indep)) > 1e-3)
+})
+
+test_that("univariate_analysis_pipeline: chain dispatch emits the chained-init message", {
+  skip_if_not_installed("susieR")
+  inp <- .make_strong_signal_inputs()
+  expect_message(
+    univariate_analysis_pipeline(
+      X = inp$X, Y = inp$y, maf = inp$maf,
+      methods = c("susie_inf", "susie_ash"), add_susie_inf = TRUE,
+      twas_weights = FALSE,
+      finemapping_extra_opts = list(refine = FALSE),
+      signal_cutoff = 0, verbose = 0
+    ),
+    "SuSiE-ash model initialized by SuSiE-inf"
+  )
+})
+
+test_that("univariate_analysis_pipeline: add_susie_inf=FALSE fits SuSiE-ash without chained init", {
+  skip_if_not_installed("susieR")
+  inp <- .make_strong_signal_inputs()
+  expect_message(
+    univariate_analysis_pipeline(
+      X = inp$X, Y = inp$y, maf = inp$maf,
+      methods = c("susie_inf", "susie_ash"), add_susie_inf = FALSE,
+      twas_weights = FALSE,
+      finemapping_extra_opts = list(refine = FALSE),
+      signal_cutoff = 0, verbose = 0
+    ),
+    "Fitting SuSiE-ash model on input data"
+  )
+})
+
+test_that("univariate_analysis_pipeline: all three methods + chain initialises both susie and susie_ash", {
+  skip_if_not_installed("susieR")
+  inp <- .make_strong_signal_inputs()
+  r <- univariate_analysis_pipeline(
+    X = inp$X, Y = inp$y, maf = inp$maf,
+    methods = c("susie_inf", "susie", "susie_ash"), add_susie_inf = TRUE,
+    twas_weights = FALSE,
+    finemapping_extra_opts = list(refine = FALSE),
+    signal_cutoff = 0, verbose = 0
+  )
+  expect_false(is.null(r$susie_inf_fitted))
+  expect_false(is.null(r$susie_fitted))
+  expect_false(is.null(r$susie_ash_fitted))
+  expect_true(all(c("susie_inf", "susie", "susie_ash") %in% unique(r$top_loci$method)))
+})
+
 test_that("univariate_analysis_pipeline: twas_weights = TRUE requires chained two-stage", {
   skip_if_not_installed("susieR")
   inp <- .make_uvp_inputs()
diff --git a/vignettes/fine-mapping.Rmd b/vignettes/fine-mapping.Rmd
index 35399bf0..cba698ce 100644
--- a/vignettes/fine-mapping.Rmd
+++ b/vignettes/fine-mapping.Rmd
@@ -95,8 +95,8 @@ rows are useful as a sanity check on inferred non-sparse effects.
 
 ## Choosing among SuSiE variants
 
-`pecotmr` exposes four SuSiE-family fine-mapping variants. Pick which to
-fit via the `methods` parameter on `univariate_analysis_pipeline()`
+`pecotmr` exposes five SuSiE-family fine-mapping variants. Pick which
+to fit via the `methods` parameter on `univariate_analysis_pipeline()`
 (individual-level) or `rss_analysis_pipeline()` / `susie_rss_pipeline()`
 (summary statistics):
 
@@ -106,6 +106,7 @@ fit via the `methods` parameter on `univariate_analysis_pipeline()`
 | **SuSiE with SuSiE-inf init** (default) | `methods = c("susie_inf", "susie")` + `add_susie_inf = TRUE` | `methods = c("susie_inf_rss", "susie_rss")` + `add_susie_inf = TRUE` | Two-stage: SuSiE-inf absorbs background polygenicity first, SuSiE then refines sparse signals from that initialisation. Recommended default for most QTL and GWAS fine-mapping. |
 | **SuSiE-inf alone** | `methods = "susie_inf"` | `methods = "susie_inf_rss"` | Sparse + infinitesimal joint fit. Useful for regions with strong polygenic background where the SuSiE refinement step is undesired. |
 | **SuSiE-ash alone** | `methods = "susie_ash"` | `methods = "susie_ash_rss"` | Sparse + adaptive shrinkage (ASH) mixture. Useful when you suspect a mix of medium-effect and small-effect variants beyond a small handful of large ones. |
+| **SuSiE-ash with SuSiE-inf init** | `methods = c("susie_inf", "susie_ash")` + `add_susie_inf = TRUE` | `methods = c("susie_inf_rss", "susie_ash_rss")` + `add_susie_inf = TRUE` | Same chained two-stage idea as the SuSiE variant but with ASH on the downstream fit: SuSiE-inf absorbs polygenic background first, then SuSiE-ash refines using the mixture prior. |
 
 The chained SuSiE-inf → SuSiE two-stage is the default both for
 backward compatibility and because it produces clean credible sets in
@@ -113,10 +114,26 @@ most settings: SuSiE alone can fold polygenic background into noisy
 "extra" credible sets; SuSiE-inf absorbs that background first so the
 SuSiE refinement concentrates on sparse signals.
 
-All four populate the same 22-column `top_loci` table. When you fit
+`add_susie_inf = TRUE` chains SuSiE-inf into **any** of `"susie"` or
+`"susie_ash"` that appear in `methods`. Listing all three
+(`methods = c("susie_inf", "susie", "susie_ash")` with
+`add_susie_inf = TRUE`) fits SuSiE-inf once, then both SuSiE and
+SuSiE-ash initialised from it — useful for direct head-to-head
+comparison on the same warm start.
+
+All five populate the same 22-column `top_loci` table. When you fit
 multiple methods in one call, the rows are row-bound and labelled by
 the `method` column.
 
+> Note: SuSiE-inf chain init only carries information when the
+> SuSiE-inf fit finds non-zero mappable effects. On a region where
+> SuSiE-inf explains everything via its infinitesimal component (all
+> `V == 0`, all `mu == 0`), upstream `susieR` correctly treats the
+> init as empty, and the chained downstream fit collapses to the
+> independent run. This is the right behaviour, not a bug — but it
+> means head-to-head differences only show up on regions with at
+> least one sparse signal.
+
 ### Backward compatibility with `add_susie_inf`
 
 The pre-existing `add_susie_inf` boolean still works exactly as before
@@ -127,10 +144,10 @@ when `methods` is `NULL` (the default):
 - `add_susie_inf = FALSE` → plain SuSiE alone. Equivalent to
   `methods = "susie"`.
 
-When `methods` is set explicitly, `add_susie_inf` only controls whether
-the chained-init shortcut is applied to a `c("susie_inf", "susie")`
-pair. Set `add_susie_inf = FALSE` with both methods present to fit
-SuSiE-inf and SuSiE independently rather than chained.
+When `methods` is set explicitly, `add_susie_inf` controls whether the
+chained-init shortcut is applied to any `"susie"` or `"susie_ash"`
+method that appears alongside `"susie_inf"`. Set `add_susie_inf = FALSE`
+to fit each requested method independently rather than chained.
 
 ### Fitting multiple variants in one call
 

From 8f5bbe7ecbd800a6dc5cc7a1db4463456c8595e5 Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Thu, 4 Jun 2026 23:25:50 -0700
Subject: [PATCH 2/6] add impute

---
 .github/recipe/recipe.yaml                 |   2 +
 DESCRIPTION                                |   1 +
 NAMESPACE                                  |   3 +
 R/misc.R                                   | 251 +++++++++++++++++++++
 _pkgdown.yml                               |   6 +
 man/flashier_imputation.Rd                 |  48 ++++
 man/molecular_trait_imputation_pipeline.Rd |  68 ++++++
 man/softimpute_imputation.Rd               |  47 ++++
 pixi.toml                                  |   1 +
 tests/testthat/test_molecular_imputation.R | 140 ++++++++++++
 vignettes/molecular-imputation.Rmd         | 185 +++++++++++++++
 11 files changed, 752 insertions(+)
 create mode 100644 man/flashier_imputation.Rd
 create mode 100644 man/molecular_trait_imputation_pipeline.Rd
 create mode 100644 man/softimpute_imputation.Rd
 create mode 100644 tests/testthat/test_molecular_imputation.R
 create mode 100644 vignettes/molecular-imputation.Rmd

diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml
index d7e3ed13..c892356f 100644
--- a/.github/recipe/recipe.yaml
+++ b/.github/recipe/recipe.yaml
@@ -73,6 +73,7 @@ requirements:
     - r-readr
     - r-rfast
     - r-rlang
+    - r-softimpute
     - r-stringr
     - r-susier
     - r-tibble
@@ -131,6 +132,7 @@ requirements:
     - r-readr
     - r-rfast
     - r-rlang
+    - r-softimpute
     - r-stringr
     - r-susier
     - r-tibble
diff --git a/DESCRIPTION b/DESCRIPTION
index 590b817a..3ca4b8c9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -74,6 +74,7 @@ Suggests:
     rtracklayer,
     SNPRelate,
     snpStats,
+    softImpute,
     testthat,
     VariantAnnotation,
     xgboost
diff --git a/NAMESPACE b/NAMESPACE
index 3b20514b..0bb64eff 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -72,6 +72,7 @@ export(find_overlapping_regions)
 export(fine_mr)
 export(fit_mash_contrast)
 export(fit_susie_inf_then_susie_rss)
+export(flashier_imputation)
 export(format_finemapping_output)
 export(fsusie_get_cs)
 export(fsusie_wrapper)
@@ -173,6 +174,7 @@ export(merge_sumstats_matrices)
 export(merge_variant_info)
 export(meta_analysis_per_cell)
 export(meta_sldsc_random)
+export(molecular_trait_imputation_pipeline)
 export(mr_analysis)
 export(mr_ash_rss_weights)
 export(mr_format)
@@ -216,6 +218,7 @@ export(sdpr_weights)
 export(slalom)
 export(sldsc_postprocessing_pipeline)
 export(slice_mash_data)
+export(softimpute_imputation)
 export(standardise_sumstats_columns)
 export(standardize_sldsc_trait)
 export(subsetChr)
diff --git a/R/misc.R b/R/misc.R
index a1d264a6..2c135c42 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -959,6 +959,257 @@ xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L,
   if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]]
 }
 
+#' softImpute-based matrix imputation
+#'
+#' Imputes missing values in a numeric matrix via the soft-thresholded SVD
+#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
+#' alternating-least-squares solver by default; \code{lambda} defaults to
+#' a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not
+#' supplied. Columns with all entries missing are removed before fitting
+#' and a one-line message records the count.
+#'
+#' @param data Numeric matrix (samples x features) with missing values
+#'   (NA).
+#' @param rank.max Maximum rank of the SVD solution. Default
+#'   \code{min(dim(data) - 1)}.
+#' @param lambda Nuclear-norm penalty. NULL (default) chooses
+#'   \code{0.5 * lambda0(data)} so the solution is not degenerate.
+#' @param type Optimisation type passed to \code{softImpute::softImpute};
+#'   one of \code{"als"} (default, faster) or \code{"svd"}.
+#' @param thresh Convergence threshold. Default 1e-5.
+#' @param maxit Maximum iterations. Default 100.
+#' @param verbose Logical, print progress. Default FALSE.
+#' @return The imputed matrix with the same dimensions as the input
+#'   (minus any all-NA columns).
+#' @export
+softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL,
+                                   type = c("als", "svd"),
+                                   thresh = 1e-5, maxit = 100,
+                                   verbose = FALSE) {
+  if (!requireNamespace("softImpute", quietly = TRUE)) {
+    stop("Package 'softImpute' is required for softimpute_imputation.")
+  }
+  type <- match.arg(type)
+  X <- as.matrix(data)
+  n <- nrow(X); p <- ncol(X)
+  all_na <- colSums(is.na(X)) == n
+  if (any(all_na)) {
+    if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.")
+    X <- X[, !all_na, drop = FALSE]
+    p <- ncol(X)
+  }
+  if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L)
+  if (is.null(lambda)) lambda <- 0.5 * softImpute::lambda0(X)
+  fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda,
+                                 type = type, thresh = thresh, maxit = maxit,
+                                 trace.it = isTRUE(verbose))
+  Ximp <- softImpute::complete(X, fit)
+  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
+  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
+  Ximp
+}
+
+#' flashier-based matrix imputation
+#'
+#' Imputes missing values in a numeric matrix via empirical Bayes
+#' matrix factorisation (\code{flashier::flash}). \code{flash} natively
+#' supports NA entries and reconstructs them from the posterior fitted
+#' values. The default prior families match those used by
+#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row
+#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column
+#' factor.
+#'
+#' @param data Numeric matrix (samples x features) with missing values
+#'   (NA).
+#' @param var_type Variance structure for the residuals, passed to
+#'   \code{flashier::flash}. Default 2 (per-column).
+#' @param ebnm_fn Prior family. Default
+#'   \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.
+#' @param greedy_Kmax Maximum number of greedy factors. Default
+#'   \code{min(20L, min(dim(data)) - 1L)}.
+#' @param backfit Logical; run a final backfit pass. Default TRUE.
+#' @param verbose Numeric verbosity (0 silent, default).
+#' @param ... Additional arguments forwarded to \code{flashier::flash}.
+#' @return The imputed matrix with the same dimensions as the input
+#'   (minus any all-NA columns).
+#' @export
+flashier_imputation <- function(data, var_type = 2,
+                                 ebnm_fn = c(ebnm::ebnm_normal,
+                                             ebnm::ebnm_normal_scale_mixture),
+                                 greedy_Kmax = NULL,
+                                 backfit = TRUE, verbose = 0, ...) {
+  if (!requireNamespace("flashier", quietly = TRUE)) {
+    stop("Package 'flashier' is required for flashier_imputation.")
+  }
+  if (!requireNamespace("ebnm", quietly = TRUE)) {
+    stop("Package 'ebnm' is required for flashier_imputation.")
+  }
+  X <- as.matrix(data)
+  n <- nrow(X); p <- ncol(X)
+  all_na <- colSums(is.na(X)) == n
+  if (any(all_na)) {
+    if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) {
+      message("Removed ", sum(all_na), " column(s) with all entries missing.")
+    }
+    X <- X[, !all_na, drop = FALSE]
+    p <- ncol(X)
+  }
+  if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L))
+  fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn,
+                         greedy_Kmax = greedy_Kmax, backfit = backfit,
+                         verbose = verbose, ...)
+  # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found,
+  # fall back to column means.
+  if (is.null(fl$n_factors) || fl$n_factors == 0) {
+    fitted <- matrix(colMeans(X, na.rm = TRUE),
+                     nrow = n, ncol = p, byrow = TRUE)
+  } else {
+    fitted <- tcrossprod(fl$L_pm, fl$F_pm)
+  }
+  Ximp <- X
+  na_idx <- is.na(X)
+  Ximp[na_idx] <- fitted[na_idx]
+  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
+  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
+  Ximp
+}
+
+#' Pipeline for molecular trait imputation
+#'
+#' Unified entry point for imputing missing values in a molecular trait
+#' matrix (e.g. expression, splicing, protein abundance). Dispatches to
+#' one or more underlying methods and optionally evaluates imputation
+#' quality on a held-out random mask of observed entries.
+#'
+#' Available methods:
+#' \itemize{
+#'   \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative
+#'     XGBoost per-column models. No structural assumption on the matrix.
+#'   \item \code{"softimpute"} - \code{\link{softimpute_imputation}},
+#'     soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a
+#'     low-rank smooth signal.
+#'   \item \code{"flashier"} - \code{\link{flashier_imputation}},
+#'     empirical Bayes matrix factorisation via \code{flashier::flash}
+#'     with sparse + scale-mixture priors.
+#' }
+#'
+#' @param data Numeric matrix (samples x features) with missing values.
+#' @param methods Character vector of methods to fit. Any subset of
+#'   \code{c("xgboost", "softimpute", "flashier")}. Default fits all
+#'   three.
+#' @param method_args Named list of per-method argument lists. List
+#'   names must match \code{methods}. Default empty.
+#' @param evaluate Logical. If TRUE, mask a random fraction of observed
+#'   entries, run each method, and report root-mean-squared error on the
+#'   masked cells. Default FALSE.
+#' @param eval_fraction Fraction of observed cells to hold out for
+#'   evaluation. Default 0.05.
+#' @param eval_seed Random seed for the held-out mask. Default 42.
+#' @param verbose Logical, print progress. Default FALSE.
+#'
+#' @return A list with:
+#' \describe{
+#'   \item{imputed}{Named list of imputed matrices, one per requested
+#'     method.}
+#'   \item{missingness}{Per-column NA count and overall NA fraction.}
+#'   \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with
+#'     one row per method: \code{method}, \code{rmse}, \code{n_held_out},
+#'     \code{time_seconds}.}
+#'   \item{method_args}{The resolved per-method arguments used.}
+#' }
+#' @export
+molecular_trait_imputation_pipeline <- function(
+    data,
+    methods = c("xgboost", "softimpute", "flashier"),
+    method_args = list(),
+    evaluate = FALSE,
+    eval_fraction = 0.05,
+    eval_seed = 42L,
+    verbose = FALSE) {
+  valid <- c("xgboost", "softimpute", "flashier")
+  if (!is.character(methods) || length(methods) == 0L) {
+    stop("methods must be a non-empty character vector.")
+  }
+  bad <- setdiff(methods, valid)
+  if (length(bad) > 0) {
+    stop("Unknown imputation method(s): ", paste(bad, collapse = ", "),
+         ". Valid options: ", paste(valid, collapse = ", "))
+  }
+  methods <- unique(methods)
+
+  Xfull <- as.matrix(data)
+  n <- nrow(Xfull); p <- ncol(Xfull)
+  na_mask_full <- is.na(Xfull)
+  na_per_col <- colSums(na_mask_full)
+  missingness <- list(
+    per_column = na_per_col,
+    n_missing = sum(na_mask_full),
+    fraction = sum(na_mask_full) / (n * p)
+  )
+  if (verbose) {
+    message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.",
+                    n, p, 100 * missingness$fraction))
+  }
+
+  dispatch <- list(xgboost = xgboost_imputation,
+                   softimpute = softimpute_imputation,
+                   flashier = flashier_imputation)
+
+  run_one <- function(m, X_in) {
+    args <- method_args[[m]] %||% list()
+    do.call(dispatch[[m]], c(list(X_in), args))
+  }
+
+  # If evaluating, hold out a random subset of observed cells, run each
+  # method against the masked matrix, then score RMSE on the held-out
+  # cells. Final imputation is fit against the original matrix so the
+  # returned matrices use every observed cell.
+  eval_df <- NULL
+  if (isTRUE(evaluate)) {
+    obs_idx <- which(!na_mask_full)
+    n_hold <- max(1L, floor(eval_fraction * length(obs_idx)))
+    if (n_hold >= length(obs_idx)) {
+      stop("eval_fraction too large; would mask all observed entries.")
+    }
+    set.seed(eval_seed)
+    hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE)
+    Xmasked <- Xfull
+    held_truth <- Xfull[hold_idx]
+    Xmasked[hold_idx] <- NA_real_
+    eval_records <- vector("list", length(methods))
+    for (i in seq_along(methods)) {
+      m <- methods[i]
+      if (verbose) message("Evaluating ", m, " on held-out mask ...")
+      t0 <- proc.time()
+      Ximp_m <- run_one(m, Xmasked)
+      t1 <- proc.time()
+      pred <- Ximp_m[hold_idx]
+      rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE))
+      eval_records[[i]] <- data.frame(
+        method = m, rmse = rmse, n_held_out = n_hold,
+        time_seconds = as.numeric((t1 - t0)["elapsed"]),
+        stringsAsFactors = FALSE
+      )
+    }
+    eval_df <- do.call(rbind, eval_records)
+    rownames(eval_df) <- NULL
+  }
+
+  # Final imputation: run each method on the full matrix
+  imputed <- list()
+  for (m in methods) {
+    if (verbose) message("Imputing with ", m, " ...")
+    imputed[[m]] <- run_one(m, Xfull)
+  }
+
+  list(
+    imputed = imputed,
+    missingness = missingness,
+    evaluation = eval_df,
+    method_args = method_args
+  )
+}
+
 #' Robust Mahalanobis Distance
 #'
 #' Drop-in replacement for \code{\link[stats]{mahalanobis}} that handles
diff --git a/_pkgdown.yml b/_pkgdown.yml
index e8bdeade..efcf0b98 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -45,6 +45,12 @@ navbar:
    aria-label: View source on GitHub
 
 articles:
+  - title: "Data Preparation"
+    desc: >
+      Imputing missing values in molecular trait matrices before
+      fine-mapping and TWAS weight training.
+    contents:
+    - molecular-imputation
   - title: "Quality Control"
     desc: >
       Allele alignment and LD-mismatch QC for GWAS and QTL summary
diff --git a/man/flashier_imputation.Rd b/man/flashier_imputation.Rd
new file mode 100644
index 00000000..90578084
--- /dev/null
+++ b/man/flashier_imputation.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/misc.R
+\name{flashier_imputation}
+\alias{flashier_imputation}
+\title{flashier-based matrix imputation}
+\usage{
+flashier_imputation(
+  data,
+  var_type = 2,
+  ebnm_fn = c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture),
+  greedy_Kmax = NULL,
+  backfit = TRUE,
+  verbose = 0,
+  ...
+)
+}
+\arguments{
+\item{data}{Numeric matrix (samples x features) with missing values
+(NA).}
+
+\item{var_type}{Variance structure for the residuals, passed to
+\code{flashier::flash}. Default 2 (per-column).}
+
+\item{ebnm_fn}{Prior family. Default
+\code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.}
+
+\item{greedy_Kmax}{Maximum number of greedy factors. Default
+\code{min(20L, min(dim(data)) - 1L)}.}
+
+\item{backfit}{Logical; run a final backfit pass. Default TRUE.}
+
+\item{verbose}{Numeric verbosity (0 silent, default).}
+
+\item{...}{Additional arguments forwarded to \code{flashier::flash}.}
+}
+\value{
+The imputed matrix with the same dimensions as the input
+  (minus any all-NA columns).
+}
+\description{
+Imputes missing values in a numeric matrix via empirical Bayes
+matrix factorisation (\code{flashier::flash}). \code{flash} natively
+supports NA entries and reconstructs them from the posterior fitted
+values. The default prior families match those used by
+\code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row
+factor and \code{ebnm::ebnm_normal_scale_mixture} on the column
+factor.
+}
diff --git a/man/molecular_trait_imputation_pipeline.Rd b/man/molecular_trait_imputation_pipeline.Rd
new file mode 100644
index 00000000..68a74d6a
--- /dev/null
+++ b/man/molecular_trait_imputation_pipeline.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/misc.R
+\name{molecular_trait_imputation_pipeline}
+\alias{molecular_trait_imputation_pipeline}
+\title{Pipeline for molecular trait imputation}
+\usage{
+molecular_trait_imputation_pipeline(
+  data,
+  methods = c("xgboost", "softimpute", "flashier"),
+  method_args = list(),
+  evaluate = FALSE,
+  eval_fraction = 0.05,
+  eval_seed = 42L,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{data}{Numeric matrix (samples x features) with missing values.}
+
+\item{methods}{Character vector of methods to fit. Any subset of
+\code{c("xgboost", "softimpute", "flashier")}. Default fits all
+three.}
+
+\item{method_args}{Named list of per-method argument lists. List
+names must match \code{methods}. Default empty.}
+
+\item{evaluate}{Logical. If TRUE, mask a random fraction of observed
+entries, run each method, and report root-mean-squared error on the
+masked cells. Default FALSE.}
+
+\item{eval_fraction}{Fraction of observed cells to hold out for
+evaluation. Default 0.05.}
+
+\item{eval_seed}{Random seed for the held-out mask. Default 42.}
+
+\item{verbose}{Logical, print progress. Default FALSE.}
+}
+\value{
+A list with:
+\describe{
+  \item{imputed}{Named list of imputed matrices, one per requested
+    method.}
+  \item{missingness}{Per-column NA count and overall NA fraction.}
+  \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with
+    one row per method: \code{method}, \code{rmse}, \code{n_held_out},
+    \code{time_seconds}.}
+  \item{method_args}{The resolved per-method arguments used.}
+}
+}
+\description{
+Unified entry point for imputing missing values in a molecular trait
+matrix (e.g. expression, splicing, protein abundance). Dispatches to
+one or more underlying methods and optionally evaluates imputation
+quality on a held-out random mask of observed entries.
+}
+\details{
+Available methods:
+\itemize{
+  \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative
+    XGBoost per-column models. No structural assumption on the matrix.
+  \item \code{"softimpute"} - \code{\link{softimpute_imputation}},
+    soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a
+    low-rank smooth signal.
+  \item \code{"flashier"} - \code{\link{flashier_imputation}},
+    empirical Bayes matrix factorisation via \code{flashier::flash}
+    with sparse + scale-mixture priors.
+}
+}
diff --git a/man/softimpute_imputation.Rd b/man/softimpute_imputation.Rd
new file mode 100644
index 00000000..efb08fa5
--- /dev/null
+++ b/man/softimpute_imputation.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/misc.R
+\name{softimpute_imputation}
+\alias{softimpute_imputation}
+\title{softImpute-based matrix imputation}
+\usage{
+softimpute_imputation(
+  data,
+  rank.max = NULL,
+  lambda = NULL,
+  type = c("als", "svd"),
+  thresh = 1e-05,
+  maxit = 100,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{data}{Numeric matrix (samples x features) with missing values
+(NA).}
+
+\item{rank.max}{Maximum rank of the SVD solution. Default
+\code{min(dim(data) - 1)}.}
+
+\item{lambda}{Nuclear-norm penalty. NULL (default) chooses
+\code{0.5 * lambda0(data)} so the solution is not degenerate.}
+
+\item{type}{Optimisation type passed to \code{softImpute::softImpute};
+one of \code{"als"} (default, faster) or \code{"svd"}.}
+
+\item{thresh}{Convergence threshold. Default 1e-5.}
+
+\item{maxit}{Maximum iterations. Default 100.}
+
+\item{verbose}{Logical, print progress. Default FALSE.}
+}
+\value{
+The imputed matrix with the same dimensions as the input
+  (minus any all-NA columns).
+}
+\description{
+Imputes missing values in a numeric matrix via the soft-thresholded SVD
+algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
+alternating-least-squares solver by default; \code{lambda} defaults to
+a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not
+supplied. Columns with all entries missing are removed before fitting
+and a one-line message records the count.
+}
diff --git a/pixi.toml b/pixi.toml
index 62a2cca3..b2afbe2a 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -92,6 +92,7 @@ r45 = {features = ["r45"]}
 "r-readr" = "*"
 "r-rfast" = "*"
 "r-rlang" = "*"
+"r-softimpute" = "*"
 "r-stringr" = "*"
 "r-susier" = "*"
 "r-tibble" = "*"
diff --git a/tests/testthat/test_molecular_imputation.R b/tests/testthat/test_molecular_imputation.R
new file mode 100644
index 00000000..2c68b808
--- /dev/null
+++ b/tests/testthat/test_molecular_imputation.R
@@ -0,0 +1,140 @@
+context("Molecular trait imputation pipeline")
+
+# Helper: low-rank synthetic matrix with NA mask.
+.make_imputation_inputs <- function(n = 60, p = 15, rank = 3, na_frac = 0.1,
+                                     seed = 1) {
+  set.seed(seed)
+  U <- matrix(rnorm(n * rank), n, rank)
+  V <- matrix(rnorm(p * rank), p, rank)
+  truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p)
+  X <- truth
+  na_mask <- matrix(runif(n * p) < na_frac, n, p)
+  X[na_mask] <- NA
+  rownames(X) <- paste0("S", seq_len(n))
+  colnames(X) <- paste0("F", seq_len(p))
+  list(X = X, truth = truth, na_mask = na_mask)
+}
+
+# =============================================================================
+# softimpute_imputation
+# =============================================================================
+
+test_that("softimpute_imputation: returns matrix with same dims, no NAs", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  Xi <- softimpute_imputation(inp$X)
+  expect_equal(dim(Xi), dim(inp$X))
+  expect_false(any(is.na(Xi)))
+  expect_equal(rownames(Xi), rownames(inp$X))
+})
+
+test_that("softimpute_imputation: handles all-NA columns by removing them", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  X <- inp$X
+  X[, 5] <- NA
+  expect_message(
+    Xi <- softimpute_imputation(X, verbose = TRUE),
+    "Removed 1 column"
+  )
+  expect_equal(ncol(Xi), ncol(X) - 1L)
+})
+
+# =============================================================================
+# flashier_imputation
+# =============================================================================
+
+test_that("flashier_imputation: returns matrix with same dims, no NAs", {
+  skip_if_not_installed("flashier")
+  skip_if_not_installed("ebnm")
+  inp <- .make_imputation_inputs()
+  Xi <- flashier_imputation(inp$X, verbose = 0)
+  expect_equal(dim(Xi), dim(inp$X))
+  expect_false(any(is.na(Xi)))
+})
+
+test_that("flashier_imputation: recovers low-rank truth better than column means", {
+  skip_if_not_installed("flashier")
+  skip_if_not_installed("ebnm")
+  inp <- .make_imputation_inputs()
+  Xi <- flashier_imputation(inp$X, verbose = 0)
+  # Baseline: column-mean imputation
+  Xbase <- inp$X
+  cm <- colMeans(Xbase, na.rm = TRUE)
+  for (j in seq_len(ncol(Xbase))) Xbase[is.na(Xbase[, j]), j] <- cm[j]
+  rmse_flash <- sqrt(mean((Xi[inp$na_mask] - inp$truth[inp$na_mask])^2))
+  rmse_mean  <- sqrt(mean((Xbase[inp$na_mask] - inp$truth[inp$na_mask])^2))
+  expect_lt(rmse_flash, rmse_mean)
+})
+
+# =============================================================================
+# molecular_trait_imputation_pipeline
+# =============================================================================
+
+test_that("pipeline: single-method dispatch returns named list keyed by method", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute")
+  expect_named(res$imputed, "softimpute")
+  expect_equal(dim(res$imputed[["softimpute"]]), dim(inp$X))
+  expect_null(res$evaluation)
+})
+
+test_that("pipeline: multi-method runs all three and returns one matrix per method", {
+  skip_if_not_installed("softImpute")
+  skip_if_not_installed("flashier")
+  skip_if_not_installed("xgboost")
+  inp <- .make_imputation_inputs()
+  res <- molecular_trait_imputation_pipeline(
+    inp$X, methods = c("xgboost", "softimpute", "flashier"),
+    method_args = list(
+      xgboost = list(maxiter = 2, nrounds = 10, verbose = FALSE)
+    )
+  )
+  expect_named(res$imputed, c("xgboost", "softimpute", "flashier"))
+  for (m in names(res$imputed)) {
+    expect_equal(dim(res$imputed[[m]]), dim(inp$X))
+    expect_false(any(is.na(res$imputed[[m]])))
+  }
+})
+
+test_that("pipeline: evaluate = TRUE reports RMSE on held-out cells", {
+  skip_if_not_installed("softImpute")
+  skip_if_not_installed("flashier")
+  inp <- .make_imputation_inputs()
+  res <- molecular_trait_imputation_pipeline(
+    inp$X, methods = c("softimpute", "flashier"),
+    evaluate = TRUE, eval_fraction = 0.1, eval_seed = 7
+  )
+  expect_s3_class(res$evaluation, "data.frame")
+  expect_equal(nrow(res$evaluation), 2L)
+  expect_named(res$evaluation,
+               c("method", "rmse", "n_held_out", "time_seconds"))
+  expect_true(all(res$evaluation$rmse > 0))
+  expect_true(all(res$evaluation$n_held_out > 0))
+})
+
+test_that("pipeline: missingness summary matches input NA count", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute")
+  expect_equal(res$missingness$n_missing, sum(is.na(inp$X)))
+  expect_equal(res$missingness$fraction,
+               sum(is.na(inp$X)) / prod(dim(inp$X)))
+})
+
+test_that("pipeline: rejects unknown method", {
+  inp <- .make_imputation_inputs()
+  expect_error(
+    molecular_trait_imputation_pipeline(inp$X, methods = "knnimpute"),
+    "Unknown imputation method"
+  )
+})
+
+test_that("pipeline: rejects empty methods vector", {
+  inp <- .make_imputation_inputs()
+  expect_error(
+    molecular_trait_imputation_pipeline(inp$X, methods = character(0)),
+    "non-empty character vector"
+  )
+})
diff --git a/vignettes/molecular-imputation.Rmd b/vignettes/molecular-imputation.Rmd
new file mode 100644
index 00000000..bbd7fd14
--- /dev/null
+++ b/vignettes/molecular-imputation.Rmd
@@ -0,0 +1,185 @@
+---
+title: "Imputing Missing Values in Molecular Trait Matrices"
+author: "pecotmr authors"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Imputing Missing Values in Molecular Trait Matrices}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
+                      fig.width = 7, fig.height = 4.5)
+```
+
+## Overview
+
+Real molecular trait data (expression, splicing, protein abundance,
+chromatin accessibility) almost always has missing values: low-coverage
+samples, transcripts below detection in some tissues, batch dropouts.
+Many downstream pecotmr workflows expect a complete matrix:
+
+- [Fine-mapping](fine-mapping.html) — `univariate_analysis_pipeline()`
+  requires a numeric `Y` vector with no NAs.
+- [TWAS weights](twas-weights.html) — `twas_weights_pipeline()` and
+  `twas_multivariate_weights_pipeline()` require complete `Y`.
+- Multivariate fits — `mrmash_wrapper()` and `multivariate_analysis_pipeline()`
+  reject NA in `Y`.
+
+`pecotmr` provides three imputation methods plus a unified pipeline.
+The methods differ in what they assume about the data:
+
+| Method | Function | Assumption | When to use |
+|---|---|---|---|
+| **XGBoost** | `xgboost_imputation()` | None — fits arbitrary nonlinear per-column models | Heterogeneous matrices where columns have different relationships. Slowest but most flexible. |
+| **softImpute** | `softimpute_imputation()` | Low-rank smooth signal | Expression matrices with a small number of dominant axes of variation. Fastest. |
+| **flashier** | `flashier_imputation()` | Sparse + scale-mixture EBMF factors | Matrices with sparse + diffuse signal mixtures. Good middle ground; reuses the same prior families as `compute_cov_flash()` for mr.mash priors. |
+
+All three are exposed through `molecular_trait_imputation_pipeline()`,
+which can fit multiple methods in one call and evaluate them on a
+held-out random mask of observed entries.
+
+```{r load-packages}
+library(pecotmr)
+```
+
+## Quick start: one method on a synthetic matrix
+
+A low-rank synthetic example illustrates the contract. We build a
+60 × 15 matrix with rank-3 truth + Gaussian noise, then mask 10% of
+entries.
+
+```{r make-synthetic}
+set.seed(1)
+n <- 60; p <- 15; rank_true <- 3
+U <- matrix(rnorm(n * rank_true), n, rank_true)
+V <- matrix(rnorm(p * rank_true), p, rank_true)
+truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p)
+rownames(truth) <- paste0("S", seq_len(n))
+colnames(truth) <- paste0("F", seq_len(p))
+
+# Mask 10% of entries
+na_mask <- matrix(runif(n * p) < 0.1, n, p)
+X <- truth
+X[na_mask] <- NA
+sprintf("Missingness: %.1f%%", 100 * mean(is.na(X)))
+```
+
+Run the three methods individually:
+
+```{r single-method-runs}
+Xi_softimpute <- softimpute_imputation(X)
+Xi_flashier   <- flashier_imputation(X, verbose = 0)
+# XGBoost can be slow; reduce iterations for the vignette.
+Xi_xgboost    <- xgboost_imputation(X, maxiter = 3, nrounds = 20,
+                                     verbose = FALSE)
+
+# RMSE against the truth on the masked cells
+rmse <- function(Xi) sqrt(mean((Xi[na_mask] - truth[na_mask])^2))
+cat(sprintf("softImpute RMSE: %.3f\n", rmse(Xi_softimpute)))
+cat(sprintf("flashier   RMSE: %.3f\n", rmse(Xi_flashier)))
+cat(sprintf("xgboost    RMSE: %.3f\n", rmse(Xi_xgboost)))
+```
+
+On rank-3 truth, flashier and softImpute both exploit the low-rank
+structure; XGBoost has no structural assumption and pays for that on
+this kind of input. Real expression matrices tend to be closer to the
+low-rank regime, so softImpute and flashier are usually competitive.
+
+## The pipeline: multi-method + held-out CV
+
+For real data you typically want to compare methods before picking one.
+`molecular_trait_imputation_pipeline()` does this in a single call.
+With `evaluate = TRUE` it holds out a random fraction of observed cells,
+fits each method against the masked matrix, and reports RMSE on the
+masked cells.
+
+```{r pipeline-multi}
+res <- molecular_trait_imputation_pipeline(
+  X,
+  methods       = c("xgboost", "softimpute", "flashier"),
+  method_args   = list(
+    xgboost = list(maxiter = 3, nrounds = 20, verbose = FALSE)
+  ),
+  evaluate      = TRUE,
+  eval_fraction = 0.05,
+  eval_seed     = 7
+)
+
+res$missingness$fraction          # overall missingness
+res$evaluation                     # per-method RMSE + time
+str(res$imputed, max.level = 1)    # final imputed matrices, one per method
+```
+
+The pipeline reports three things:
+
+- **`imputed`** — named list of imputed matrices, one per requested
+  method. These were fit on the **full** input (including the cells
+  that were held out for CV), so they are what you would use
+  downstream.
+- **`missingness`** — per-column NA counts and overall NA fraction.
+- **`evaluation`** — when `evaluate = TRUE`, a data.frame with one row
+  per method: `method`, `rmse`, `n_held_out`, `time_seconds`. Use it to
+  pick the method that best handles your data's structure.
+
+## Choosing a method
+
+| Method | Typical RMSE behaviour | Cost | Notes |
+|---|---|---|---|
+| **softimpute** | Best on smooth low-rank matrices. Sensitive to the choice of `lambda`; the default `0.5 * lambda0(x)` is a reasonable starting point. | Very fast (seconds). | Tune `lambda` and `rank.max` if RMSE is poor on your data. |
+| **flashier** | Best on mixed sparse + diffuse signal. Default priors (`ebnm_normal` + `ebnm_normal_scale_mixture`) work well for most expression-like matrices. | Moderate (seconds-to-minutes). | Reuses the same FLASH defaults as `compute_cov_flash()` (mr.mash prior step). |
+| **xgboost** | Robust to non-linear column relationships. Can overfit on small matrices. | Slowest (minutes-to-hours for large matrices). | Set `num_workers > 1` for parallel per-column fits. Tune `max_depth` and `nrounds`. |
+
+A simple rule of thumb: run all three with `evaluate = TRUE` on a
+small subset of your data, then use the best for the full matrix.
+
+## Integration with the rest of pecotmr
+
+The imputed matrix slots directly into the downstream pipelines:
+
+```{r downstream, eval=FALSE}
+# Imputed phenotype matrix from the pipeline
+Y_complete <- res$imputed[["flashier"]]
+
+# Fit fine-mapping on a single context (one column of Y)
+fit <- univariate_analysis_pipeline(
+  X = genotype_matrix, Y = Y_complete[, "F1"], maf = maf_vector,
+  twas_weights = TRUE
+)
+
+# Or fit multivariate models on the full matrix
+mnm_fit <- multivariate_analysis_pipeline(
+  X = genotype_matrix, Y = Y_complete,
+  data_driven_prior_matrices = prior_matrices
+)
+mt <- twas_multivariate_weights_pipeline(
+  X = genotype_matrix, Y = Y_complete, mnm_fit = mnm_fit
+)
+```
+
+Imputation happens once, upstream of fine-mapping and TWAS weight
+training, on the full phenotype matrix across contexts. The pipelines
+do not re-impute internally.
+
+## Caveats
+
+1. **Don't over-impute.** A column that's missing in most samples won't
+   recover useful structure no matter which method you use. Drop
+   columns with high missingness (`> 50%` is a common cutoff) before
+   imputing.
+2. **All-NA columns are dropped.** Each method removes columns where
+   every entry is missing and emits a one-line message. The returned
+   matrix has fewer columns than the input in that case.
+3. **CV evaluation uses one random mask.** For more stable RMSE
+   estimates, run the pipeline a few times with different `eval_seed`
+   values and average.
+4. **softImpute lambda tuning matters.** The default
+   `0.5 * lambda0(x)` is a starting point. If softImpute looks
+   uncompetitive on your data, sweep `lambda` over a grid (e.g.
+   `lambda = lambda0(x) * c(0.1, 0.25, 0.5, 1, 2)`).
+
+```{r session-info}
+sessionInfo()
+```

From 1a61facac4b5dba5b8f685ee51f4dba01e6133c5 Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Fri, 5 Jun 2026 01:18:14 -0700
Subject: [PATCH 3/6] add impute pipeline

---
 DESCRIPTION                                |   1 +
 R/imputation.R                             | 468 +++++++++++++++++++++
 R/misc.R                                   | 359 ----------------
 man/flashier_imputation.Rd                 |   2 +-
 man/molecular_trait_imputation_pipeline.Rd |   2 +-
 man/softimpute_imputation.Rd               |  38 +-
 man/xgboost_imputation.Rd                  |   2 +-
 tests/testthat/test_molecular_imputation.R |  45 ++
 vignettes/molecular-imputation.Rmd         |  14 +-
 9 files changed, 555 insertions(+), 376 deletions(-)
 create mode 100644 R/imputation.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 3ca4b8c9..830def16 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -112,6 +112,7 @@ Collate:
     'h2_hdl.R'
     'h2_lder.R'
     'h2_utils.R'
+    'imputation.R'
     'ld_loader.R'
     'mash_wrapper.R'
     'misc.R'
diff --git a/R/imputation.R b/R/imputation.R
new file mode 100644
index 00000000..226d6be7
--- /dev/null
+++ b/R/imputation.R
@@ -0,0 +1,468 @@
+# =============================================================================
+# Molecular trait imputation
+#
+# User-facing imputation methods for filling missing values in molecular trait
+# matrices (expression, splicing, protein abundance) before fine-mapping and
+# TWAS-weight training. Three method wrappers plus a unified pipeline; see the
+# molecular-imputation vignette for when to use which method.
+#
+# The genotype-side mean_impute() helper lives in misc.R because it is an
+# internal building block of compute_LD() and genotype loading, not a
+# user-facing imputer.
+# =============================================================================
+
+#' XGBoost-based iterative imputation of missing values
+#'
+#' Imputes missing values in a numeric matrix by iteratively training
+#' per-column XGBoost models on observed entries and predicting missing ones.
+#' Columns that are entirely missing are removed. Initial imputation uses
+#' column means.
+#'
+#' @param data Numeric matrix with missing values (NA).
+#' @param maxiter Maximum number of imputation iterations (default 10).
+#' @param max_depth Maximum tree depth for XGBoost (default 2).
+#' @param nrounds Number of boosting rounds per variable (default 50).
+#' @param decreasing Logical. If TRUE, impute variables with most missing
+#'   values first. Default FALSE (fewest missing first).
+#' @param num_workers Number of parallel workers for BiocParallel. Default 1
+#'   (sequential).
+#' @param verbose Logical, print progress (default TRUE).
+#' @return The imputed matrix with the same dimensions as the input (minus
+#'   any all-NA columns).
+#' @importFrom BiocParallel MulticoreParam SerialParam bplapply
+#' @export
+xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L,
+                                nrounds = 50L, decreasing = FALSE,
+                                num_workers = 1L, verbose = TRUE) {
+  if (!requireNamespace("xgboost", quietly = TRUE))
+    stop("Package 'xgboost' is required for xgboost_imputation")
+
+  xmis <- as.matrix(data)
+  n <- nrow(xmis)
+  p <- ncol(xmis)
+
+  # Remove completely missing columns
+  all_na <- colSums(is.na(xmis)) == n
+  if (any(all_na)) {
+    if (verbose)
+      message("Removed ", sum(all_na), " column(s) with all entries missing.")
+    xmis <- xmis[, !all_na, drop = FALSE]
+    p <- ncol(xmis)
+  }
+
+  # Initial mean imputation
+  ximp <- xmis
+  col_means <- colMeans(xmis, na.rm = TRUE)
+  for (j in seq_len(p)) {
+    ximp[is.na(xmis[, j]), j] <- col_means[j]
+  }
+
+  # Missing value locations
+  NAloc <- is.na(xmis)
+  noNAvar <- colSums(NAloc)
+  sort_j <- order(noNAvar, decreasing = decreasing)
+  nzsort_j <- sort_j[noNAvar[sort_j] > 0]
+
+  if (length(nzsort_j) == 0) {
+    if (verbose) message("No missing values to impute.")
+    return(ximp)
+  }
+
+  # Set up BiocParallel
+  if (num_workers > 1L) {
+    BPPARAM <- MulticoreParam(workers = num_workers)
+  } else {
+    BPPARAM <- SerialParam()
+  }
+
+  iter <- 0L
+  conv_new <- 0
+  conv_old <- Inf
+  ximp_history <- vector("list", maxiter)
+
+  while (conv_new < conv_old && iter < maxiter) {
+    if (iter > 0) conv_old <- conv_new
+    if (verbose) message("  XGBoost iteration ", iter + 1L, " in progress...")
+
+    ximp_old <- ximp
+
+    # Impute each variable with missing values
+    impute_one <- function(var_idx) {
+      obsi <- !NAloc[, var_idx]
+      misi <- NAloc[, var_idx]
+      obsY <- ximp[obsi, var_idx]
+      obsX <- ximp[obsi, -var_idx, drop = FALSE]
+      misX <- ximp[misi, -var_idx, drop = FALSE]
+
+      xgb_train <- xgboost::xgb.DMatrix(data = obsX, label = obsY)
+      xgb_pred <- xgboost::xgb.DMatrix(data = misX)
+      model <- xgboost::xgb.train(
+        params = list(max_depth = max_depth, verbosity = 0),
+        data = xgb_train, nrounds = nrounds)
+      list(var_idx = var_idx, predicted = predict(model, xgb_pred))
+    }
+
+    results <- bplapply(nzsort_j, impute_one, BPPARAM = BPPARAM)
+
+    for (res in results) {
+      misi <- NAloc[, res$var_idx]
+      ximp[misi, res$var_idx] <- res$predicted
+    }
+
+    iter <- iter + 1L
+    ximp_history[[iter]] <- ximp
+
+    # Convergence: relative change in imputed values
+    conv_new <- sum((ximp - ximp_old)^2) / sum(ximp^2)
+  }
+
+  # Return last improving iteration
+  if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]]
+}
+
+#' softImpute-based matrix imputation
+#'
+#' Imputes missing values in a numeric matrix via the soft-thresholded SVD
+#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
+#' alternating-least-squares solver by default.
+#'
+#' \code{lambda} drives the bias-variance trade-off. By default the
+#' function tunes it via held-out cross-validation: a random fraction of
+#' observed cells is masked, softImpute is fit across a geometric grid of
+#' lambdas with warm-start, and the lambda minimising RMSE on the masked
+#' cells is selected. The final imputation then uses that lambda on the
+#' full input. Pass a scalar \code{lambda} to skip tuning.
+#'
+#' Columns with all entries missing are removed before fitting and a
+#' one-line message records the count.
+#'
+#' @param data Numeric matrix (samples x features) with missing values
+#'   (NA).
+#' @param rank.max Maximum rank of the SVD solution. Default
+#'   \code{min(dim(data) - 1)}.
+#' @param lambda Nuclear-norm penalty. \code{NULL} (default) tunes over
+#'   a default geometric grid via held-out CV. A scalar uses that lambda
+#'   directly with no tuning. A vector defines the tuning grid.
+#' @param tune_fraction Fraction of observed cells held out for CV.
+#'   Default 0.05. Ignored when \code{lambda} is a scalar.
+#' @param tune_seed Seed for the held-out mask. Default 42L.
+#' @param type Optimisation type passed to \code{softImpute::softImpute};
+#'   one of \code{"als"} (default, faster) or \code{"svd"}.
+#' @param thresh Convergence threshold. Default 1e-5.
+#' @param maxit Maximum iterations. Default 100.
+#' @param verbose Logical, print progress (including per-lambda CV RMSE).
+#'   Default FALSE.
+#' @return The imputed matrix with the same dimensions as the input
+#'   (minus any all-NA columns). Carries attributes
+#'   \code{"softimpute_lambda"} (the lambda used for the final fit) and,
+#'   when tuning was performed, \code{"softimpute_lambda_cv"} (a
+#'   data.frame of grid-search RMSE values).
+#' @export
+softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL,
+                                   tune_fraction = 0.05,
+                                   tune_seed = 42L,
+                                   type = c("als", "svd"),
+                                   thresh = 1e-5, maxit = 100,
+                                   verbose = FALSE) {
+  if (!requireNamespace("softImpute", quietly = TRUE)) {
+    stop("Package 'softImpute' is required for softimpute_imputation.")
+  }
+  type <- match.arg(type)
+  X <- as.matrix(data)
+  n <- nrow(X); p <- ncol(X)
+  all_na <- colSums(is.na(X)) == n
+  if (any(all_na)) {
+    if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.")
+    X <- X[, !all_na, drop = FALSE]
+    p <- ncol(X)
+  }
+  if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L)
+  lambda0_val <- softImpute::lambda0(X)
+
+  # Decide whether to tune. lambda = scalar => no tuning; NULL/vector => tune.
+  tune <- is.null(lambda) || length(lambda) > 1L
+  if (tune) {
+    if (is.null(lambda)) {
+      # Default geometric grid from lambda0 down to 1e-3 * lambda0.
+      lambda_grid <- exp(seq(log(lambda0_val),
+                              log(max(1e-3 * lambda0_val, .Machine$double.eps)),
+                              length.out = 10L))
+    } else {
+      lambda_grid <- sort(as.numeric(lambda), decreasing = TRUE)
+    }
+    cv_res <- .softimpute_cv_lambda(X, lambda_grid = lambda_grid,
+                                     rank.max = rank.max,
+                                     tune_fraction = tune_fraction,
+                                     tune_seed = tune_seed,
+                                     type = type, thresh = thresh, maxit = maxit,
+                                     verbose = verbose)
+    lambda <- cv_res$best_lambda
+    if (verbose) {
+      message(sprintf("softimpute_imputation: tuned lambda = %.4g (CV RMSE %.4f over %d cells)",
+                      lambda, cv_res$best_rmse, cv_res$n_held_out))
+    }
+  }
+
+  fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda,
+                                 type = type, thresh = thresh, maxit = maxit,
+                                 trace.it = isTRUE(verbose))
+  Ximp <- softImpute::complete(X, fit)
+  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
+  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
+  attr(Ximp, "softimpute_lambda") <- lambda
+  if (tune) attr(Ximp, "softimpute_lambda_cv") <- cv_res$grid
+  Ximp
+}
+
+# Internal: CV-tune softImpute's lambda over a grid using a held-out mask.
+# Uses softImpute's warm.start feature to chain fits along the lambda path
+# (largest lambda first). Returns the chosen lambda, its CV RMSE, and the
+# full grid scores.
+.softimpute_cv_lambda <- function(X, lambda_grid, rank.max,
+                                   tune_fraction, tune_seed,
+                                   type, thresh, maxit, verbose) {
+  na_mask <- is.na(X)
+  obs_idx <- which(!na_mask)
+  n_hold <- max(1L, floor(tune_fraction * length(obs_idx)))
+  if (n_hold >= length(obs_idx)) {
+    stop("tune_fraction too large; would mask all observed entries.")
+  }
+  set.seed(tune_seed)
+  hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE)
+  truth <- X[hold_idx]
+  Xmasked <- X
+  Xmasked[hold_idx] <- NA_real_
+
+  lambdas <- sort(as.numeric(lambda_grid), decreasing = TRUE)
+  rmse_vec <- numeric(length(lambdas))
+  warm <- NULL
+  for (i in seq_along(lambdas)) {
+    fit_args <- list(x = Xmasked, rank.max = rank.max, lambda = lambdas[i],
+                     type = type, thresh = thresh, maxit = maxit,
+                     trace.it = FALSE)
+    if (!is.null(warm)) fit_args$warm.start <- warm
+    fit <- tryCatch(do.call(softImpute::softImpute, fit_args),
+                    error = function(e) NULL)
+    if (is.null(fit)) {
+      rmse_vec[i] <- NA_real_
+      next
+    }
+    Ximp <- softImpute::complete(Xmasked, fit)
+    rmse_vec[i] <- sqrt(mean((Ximp[hold_idx] - truth)^2, na.rm = TRUE))
+    warm <- fit
+    if (verbose) {
+      message(sprintf("  softimpute CV lambda = %.4g -> RMSE %.4f",
+                      lambdas[i], rmse_vec[i]))
+    }
+  }
+  grid <- data.frame(lambda = lambdas, rmse = rmse_vec,
+                     stringsAsFactors = FALSE)
+  ok <- which(is.finite(rmse_vec))
+  if (length(ok) == 0L) {
+    stop("All softImpute CV fits failed; supply lambda explicitly.")
+  }
+  best <- ok[which.min(rmse_vec[ok])]
+  list(best_lambda = lambdas[best], best_rmse = rmse_vec[best],
+       n_held_out = n_hold, grid = grid)
+}
+
+#' flashier-based matrix imputation
+#'
+#' Imputes missing values in a numeric matrix via empirical Bayes
+#' matrix factorisation (\code{flashier::flash}). \code{flash} natively
+#' supports NA entries and reconstructs them from the posterior fitted
+#' values. The default prior families match those used by
+#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row
+#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column
+#' factor.
+#'
+#' @param data Numeric matrix (samples x features) with missing values
+#'   (NA).
+#' @param var_type Variance structure for the residuals, passed to
+#'   \code{flashier::flash}. Default 2 (per-column).
+#' @param ebnm_fn Prior family. Default
+#'   \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.
+#' @param greedy_Kmax Maximum number of greedy factors. Default
+#'   \code{min(20L, min(dim(data)) - 1L)}.
+#' @param backfit Logical; run a final backfit pass. Default TRUE.
+#' @param verbose Numeric verbosity (0 silent, default).
+#' @param ... Additional arguments forwarded to \code{flashier::flash}.
+#' @return The imputed matrix with the same dimensions as the input
+#'   (minus any all-NA columns).
+#' @export
+flashier_imputation <- function(data, var_type = 2,
+                                 ebnm_fn = c(ebnm::ebnm_normal,
+                                             ebnm::ebnm_normal_scale_mixture),
+                                 greedy_Kmax = NULL,
+                                 backfit = TRUE, verbose = 0, ...) {
+  if (!requireNamespace("flashier", quietly = TRUE)) {
+    stop("Package 'flashier' is required for flashier_imputation.")
+  }
+  if (!requireNamespace("ebnm", quietly = TRUE)) {
+    stop("Package 'ebnm' is required for flashier_imputation.")
+  }
+  X <- as.matrix(data)
+  n <- nrow(X); p <- ncol(X)
+  all_na <- colSums(is.na(X)) == n
+  if (any(all_na)) {
+    if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) {
+      message("Removed ", sum(all_na), " column(s) with all entries missing.")
+    }
+    X <- X[, !all_na, drop = FALSE]
+    p <- ncol(X)
+  }
+  if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L))
+  fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn,
+                         greedy_Kmax = greedy_Kmax, backfit = backfit,
+                         verbose = verbose, ...)
+  # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found,
+  # fall back to column means.
+  if (is.null(fl$n_factors) || fl$n_factors == 0) {
+    fitted <- matrix(colMeans(X, na.rm = TRUE),
+                     nrow = n, ncol = p, byrow = TRUE)
+  } else {
+    fitted <- tcrossprod(fl$L_pm, fl$F_pm)
+  }
+  Ximp <- X
+  na_idx <- is.na(X)
+  Ximp[na_idx] <- fitted[na_idx]
+  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
+  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
+  Ximp
+}
+
+#' Pipeline for molecular trait imputation
+#'
+#' Unified entry point for imputing missing values in a molecular trait
+#' matrix (e.g. expression, splicing, protein abundance). Dispatches to
+#' one or more underlying methods and optionally evaluates imputation
+#' quality on a held-out random mask of observed entries.
+#'
+#' Available methods:
+#' \itemize{
+#'   \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative
+#'     XGBoost per-column models. No structural assumption on the matrix.
+#'   \item \code{"softimpute"} - \code{\link{softimpute_imputation}},
+#'     soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a
+#'     low-rank smooth signal.
+#'   \item \code{"flashier"} - \code{\link{flashier_imputation}},
+#'     empirical Bayes matrix factorisation via \code{flashier::flash}
+#'     with sparse + scale-mixture priors.
+#' }
+#'
+#' @param data Numeric matrix (samples x features) with missing values.
+#' @param methods Character vector of methods to fit. Any subset of
+#'   \code{c("xgboost", "softimpute", "flashier")}. Default fits all
+#'   three.
+#' @param method_args Named list of per-method argument lists. List
+#'   names must match \code{methods}. Default empty.
+#' @param evaluate Logical. If TRUE, mask a random fraction of observed
+#'   entries, run each method, and report root-mean-squared error on the
+#'   masked cells. Default FALSE.
+#' @param eval_fraction Fraction of observed cells to hold out for
+#'   evaluation. Default 0.05.
+#' @param eval_seed Random seed for the held-out mask. Default 42.
+#' @param verbose Logical, print progress. Default FALSE.
+#'
+#' @return A list with:
+#' \describe{
+#'   \item{imputed}{Named list of imputed matrices, one per requested
+#'     method.}
+#'   \item{missingness}{Per-column NA count and overall NA fraction.}
+#'   \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with
+#'     one row per method: \code{method}, \code{rmse}, \code{n_held_out},
+#'     \code{time_seconds}.}
+#'   \item{method_args}{The resolved per-method arguments used.}
+#' }
+#' @export
+molecular_trait_imputation_pipeline <- function(
+    data,
+    methods = c("xgboost", "softimpute", "flashier"),
+    method_args = list(),
+    evaluate = FALSE,
+    eval_fraction = 0.05,
+    eval_seed = 42L,
+    verbose = FALSE) {
+  valid <- c("xgboost", "softimpute", "flashier")
+  if (!is.character(methods) || length(methods) == 0L) {
+    stop("methods must be a non-empty character vector.")
+  }
+  bad <- setdiff(methods, valid)
+  if (length(bad) > 0) {
+    stop("Unknown imputation method(s): ", paste(bad, collapse = ", "),
+         ". Valid options: ", paste(valid, collapse = ", "))
+  }
+  methods <- unique(methods)
+
+  Xfull <- as.matrix(data)
+  n <- nrow(Xfull); p <- ncol(Xfull)
+  na_mask_full <- is.na(Xfull)
+  na_per_col <- colSums(na_mask_full)
+  missingness <- list(
+    per_column = na_per_col,
+    n_missing = sum(na_mask_full),
+    fraction = sum(na_mask_full) / (n * p)
+  )
+  if (verbose) {
+    message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.",
+                    n, p, 100 * missingness$fraction))
+  }
+
+  dispatch <- list(xgboost = xgboost_imputation,
+                   softimpute = softimpute_imputation,
+                   flashier = flashier_imputation)
+
+  run_one <- function(m, X_in) {
+    args <- method_args[[m]] %||% list()
+    do.call(dispatch[[m]], c(list(X_in), args))
+  }
+
+  # If evaluating, hold out a random subset of observed cells, run each
+  # method against the masked matrix, then score RMSE on the held-out
+  # cells. Final imputation is fit against the original matrix so the
+  # returned matrices use every observed cell.
+  eval_df <- NULL
+  if (isTRUE(evaluate)) {
+    obs_idx <- which(!na_mask_full)
+    n_hold <- max(1L, floor(eval_fraction * length(obs_idx)))
+    if (n_hold >= length(obs_idx)) {
+      stop("eval_fraction too large; would mask all observed entries.")
+    }
+    set.seed(eval_seed)
+    hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE)
+    Xmasked <- Xfull
+    held_truth <- Xfull[hold_idx]
+    Xmasked[hold_idx] <- NA_real_
+    eval_records <- vector("list", length(methods))
+    for (i in seq_along(methods)) {
+      m <- methods[i]
+      if (verbose) message("Evaluating ", m, " on held-out mask ...")
+      t0 <- proc.time()
+      Ximp_m <- run_one(m, Xmasked)
+      t1 <- proc.time()
+      pred <- Ximp_m[hold_idx]
+      rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE))
+      eval_records[[i]] <- data.frame(
+        method = m, rmse = rmse, n_held_out = n_hold,
+        time_seconds = as.numeric((t1 - t0)["elapsed"]),
+        stringsAsFactors = FALSE
+      )
+    }
+    eval_df <- do.call(rbind, eval_records)
+    rownames(eval_df) <- NULL
+  }
+
+  # Final imputation: run each method on the full matrix
+  imputed <- list()
+  for (m in methods) {
+    if (verbose) message("Imputing with ", m, " ...")
+    imputed[[m]] <- run_one(m, Xfull)
+  }
+
+  list(
+    imputed = imputed,
+    missingness = missingness,
+    evaluation = eval_df,
+    method_args = method_args
+  )
+}
diff --git a/R/misc.R b/R/misc.R
index 2c135c42..d9505296 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -850,365 +850,6 @@ filter_molecular_events <- function(events, filters, condition = NULL, remove_al
   return(filtered_events)
 }
 
-#' XGBoost-based iterative imputation of missing values
-#'
-#' Imputes missing values in a numeric matrix by iteratively training
-#' per-column XGBoost models on observed entries and predicting missing ones.
-#' Columns that are entirely missing are removed. Initial imputation uses
-#' column means.
-#'
-#' @param data Numeric matrix with missing values (NA).
-#' @param maxiter Maximum number of imputation iterations (default 10).
-#' @param max_depth Maximum tree depth for XGBoost (default 2).
-#' @param nrounds Number of boosting rounds per variable (default 50).
-#' @param decreasing Logical. If TRUE, impute variables with most missing
-#'   values first. Default FALSE (fewest missing first).
-#' @param num_workers Number of parallel workers for BiocParallel. Default 1
-#'   (sequential).
-#' @param verbose Logical, print progress (default TRUE).
-#' @return The imputed matrix with the same dimensions as the input (minus
-#'   any all-NA columns).
-#' @importFrom BiocParallel MulticoreParam SerialParam bplapply
-#' @export
-xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L,
-                                nrounds = 50L, decreasing = FALSE,
-                                num_workers = 1L, verbose = TRUE) {
-  if (!requireNamespace("xgboost", quietly = TRUE))
-    stop("Package 'xgboost' is required for xgboost_imputation")
-
-  xmis <- as.matrix(data)
-  n <- nrow(xmis)
-  p <- ncol(xmis)
-
-  # Remove completely missing columns
-  all_na <- colSums(is.na(xmis)) == n
-  if (any(all_na)) {
-    if (verbose)
-      message("Removed ", sum(all_na), " column(s) with all entries missing.")
-    xmis <- xmis[, !all_na, drop = FALSE]
-    p <- ncol(xmis)
-  }
-
-  # Initial mean imputation
-  ximp <- xmis
-  col_means <- colMeans(xmis, na.rm = TRUE)
-  for (j in seq_len(p)) {
-    ximp[is.na(xmis[, j]), j] <- col_means[j]
-  }
-
-  # Missing value locations
-  NAloc <- is.na(xmis)
-  noNAvar <- colSums(NAloc)
-  sort_j <- order(noNAvar, decreasing = decreasing)
-  nzsort_j <- sort_j[noNAvar[sort_j] > 0]
-
-  if (length(nzsort_j) == 0) {
-    if (verbose) message("No missing values to impute.")
-    return(ximp)
-  }
-
-  # Set up BiocParallel
-  if (num_workers > 1L) {
-    BPPARAM <- MulticoreParam(workers = num_workers)
-  } else {
-    BPPARAM <- SerialParam()
-  }
-
-  iter <- 0L
-  conv_new <- 0
-  conv_old <- Inf
-  ximp_history <- vector("list", maxiter)
-
-  while (conv_new < conv_old && iter < maxiter) {
-    if (iter > 0) conv_old <- conv_new
-    if (verbose) message("  XGBoost iteration ", iter + 1L, " in progress...")
-
-    ximp_old <- ximp
-
-    # Impute each variable with missing values
-    impute_one <- function(var_idx) {
-      obsi <- !NAloc[, var_idx]
-      misi <- NAloc[, var_idx]
-      obsY <- ximp[obsi, var_idx]
-      obsX <- ximp[obsi, -var_idx, drop = FALSE]
-      misX <- ximp[misi, -var_idx, drop = FALSE]
-
-      xgb_train <- xgboost::xgb.DMatrix(data = obsX, label = obsY)
-      xgb_pred <- xgboost::xgb.DMatrix(data = misX)
-      model <- xgboost::xgb.train(
-        params = list(max_depth = max_depth, verbosity = 0),
-        data = xgb_train, nrounds = nrounds)
-      list(var_idx = var_idx, predicted = predict(model, xgb_pred))
-    }
-
-    results <- bplapply(nzsort_j, impute_one, BPPARAM = BPPARAM)
-
-    for (res in results) {
-      misi <- NAloc[, res$var_idx]
-      ximp[misi, res$var_idx] <- res$predicted
-    }
-
-    iter <- iter + 1L
-    ximp_history[[iter]] <- ximp
-
-    # Convergence: relative change in imputed values
-    conv_new <- sum((ximp - ximp_old)^2) / sum(ximp^2)
-  }
-
-  # Return last improving iteration
-  if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]]
-}
-
-#' softImpute-based matrix imputation
-#'
-#' Imputes missing values in a numeric matrix via the soft-thresholded SVD
-#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
-#' alternating-least-squares solver by default; \code{lambda} defaults to
-#' a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not
-#' supplied. Columns with all entries missing are removed before fitting
-#' and a one-line message records the count.
-#'
-#' @param data Numeric matrix (samples x features) with missing values
-#'   (NA).
-#' @param rank.max Maximum rank of the SVD solution. Default
-#'   \code{min(dim(data) - 1)}.
-#' @param lambda Nuclear-norm penalty. NULL (default) chooses
-#'   \code{0.5 * lambda0(data)} so the solution is not degenerate.
-#' @param type Optimisation type passed to \code{softImpute::softImpute};
-#'   one of \code{"als"} (default, faster) or \code{"svd"}.
-#' @param thresh Convergence threshold. Default 1e-5.
-#' @param maxit Maximum iterations. Default 100.
-#' @param verbose Logical, print progress. Default FALSE.
-#' @return The imputed matrix with the same dimensions as the input
-#'   (minus any all-NA columns).
-#' @export
-softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL,
-                                   type = c("als", "svd"),
-                                   thresh = 1e-5, maxit = 100,
-                                   verbose = FALSE) {
-  if (!requireNamespace("softImpute", quietly = TRUE)) {
-    stop("Package 'softImpute' is required for softimpute_imputation.")
-  }
-  type <- match.arg(type)
-  X <- as.matrix(data)
-  n <- nrow(X); p <- ncol(X)
-  all_na <- colSums(is.na(X)) == n
-  if (any(all_na)) {
-    if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.")
-    X <- X[, !all_na, drop = FALSE]
-    p <- ncol(X)
-  }
-  if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L)
-  if (is.null(lambda)) lambda <- 0.5 * softImpute::lambda0(X)
-  fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda,
-                                 type = type, thresh = thresh, maxit = maxit,
-                                 trace.it = isTRUE(verbose))
-  Ximp <- softImpute::complete(X, fit)
-  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
-  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
-  Ximp
-}
-
-#' flashier-based matrix imputation
-#'
-#' Imputes missing values in a numeric matrix via empirical Bayes
-#' matrix factorisation (\code{flashier::flash}). \code{flash} natively
-#' supports NA entries and reconstructs them from the posterior fitted
-#' values. The default prior families match those used by
-#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row
-#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column
-#' factor.
-#'
-#' @param data Numeric matrix (samples x features) with missing values
-#'   (NA).
-#' @param var_type Variance structure for the residuals, passed to
-#'   \code{flashier::flash}. Default 2 (per-column).
-#' @param ebnm_fn Prior family. Default
-#'   \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.
-#' @param greedy_Kmax Maximum number of greedy factors. Default
-#'   \code{min(20L, min(dim(data)) - 1L)}.
-#' @param backfit Logical; run a final backfit pass. Default TRUE.
-#' @param verbose Numeric verbosity (0 silent, default).
-#' @param ... Additional arguments forwarded to \code{flashier::flash}.
-#' @return The imputed matrix with the same dimensions as the input
-#'   (minus any all-NA columns).
-#' @export
-flashier_imputation <- function(data, var_type = 2,
-                                 ebnm_fn = c(ebnm::ebnm_normal,
-                                             ebnm::ebnm_normal_scale_mixture),
-                                 greedy_Kmax = NULL,
-                                 backfit = TRUE, verbose = 0, ...) {
-  if (!requireNamespace("flashier", quietly = TRUE)) {
-    stop("Package 'flashier' is required for flashier_imputation.")
-  }
-  if (!requireNamespace("ebnm", quietly = TRUE)) {
-    stop("Package 'ebnm' is required for flashier_imputation.")
-  }
-  X <- as.matrix(data)
-  n <- nrow(X); p <- ncol(X)
-  all_na <- colSums(is.na(X)) == n
-  if (any(all_na)) {
-    if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) {
-      message("Removed ", sum(all_na), " column(s) with all entries missing.")
-    }
-    X <- X[, !all_na, drop = FALSE]
-    p <- ncol(X)
-  }
-  if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L))
-  fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn,
-                         greedy_Kmax = greedy_Kmax, backfit = backfit,
-                         verbose = verbose, ...)
-  # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found,
-  # fall back to column means.
-  if (is.null(fl$n_factors) || fl$n_factors == 0) {
-    fitted <- matrix(colMeans(X, na.rm = TRUE),
-                     nrow = n, ncol = p, byrow = TRUE)
-  } else {
-    fitted <- tcrossprod(fl$L_pm, fl$F_pm)
-  }
-  Ximp <- X
-  na_idx <- is.na(X)
-  Ximp[na_idx] <- fitted[na_idx]
-  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
-  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
-  Ximp
-}
-
-#' Pipeline for molecular trait imputation
-#'
-#' Unified entry point for imputing missing values in a molecular trait
-#' matrix (e.g. expression, splicing, protein abundance). Dispatches to
-#' one or more underlying methods and optionally evaluates imputation
-#' quality on a held-out random mask of observed entries.
-#'
-#' Available methods:
-#' \itemize{
-#'   \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative
-#'     XGBoost per-column models. No structural assumption on the matrix.
-#'   \item \code{"softimpute"} - \code{\link{softimpute_imputation}},
-#'     soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a
-#'     low-rank smooth signal.
-#'   \item \code{"flashier"} - \code{\link{flashier_imputation}},
-#'     empirical Bayes matrix factorisation via \code{flashier::flash}
-#'     with sparse + scale-mixture priors.
-#' }
-#'
-#' @param data Numeric matrix (samples x features) with missing values.
-#' @param methods Character vector of methods to fit. Any subset of
-#'   \code{c("xgboost", "softimpute", "flashier")}. Default fits all
-#'   three.
-#' @param method_args Named list of per-method argument lists. List
-#'   names must match \code{methods}. Default empty.
-#' @param evaluate Logical. If TRUE, mask a random fraction of observed
-#'   entries, run each method, and report root-mean-squared error on the
-#'   masked cells. Default FALSE.
-#' @param eval_fraction Fraction of observed cells to hold out for
-#'   evaluation. Default 0.05.
-#' @param eval_seed Random seed for the held-out mask. Default 42.
-#' @param verbose Logical, print progress. Default FALSE.
-#'
-#' @return A list with:
-#' \describe{
-#'   \item{imputed}{Named list of imputed matrices, one per requested
-#'     method.}
-#'   \item{missingness}{Per-column NA count and overall NA fraction.}
-#'   \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with
-#'     one row per method: \code{method}, \code{rmse}, \code{n_held_out},
-#'     \code{time_seconds}.}
-#'   \item{method_args}{The resolved per-method arguments used.}
-#' }
-#' @export
-molecular_trait_imputation_pipeline <- function(
-    data,
-    methods = c("xgboost", "softimpute", "flashier"),
-    method_args = list(),
-    evaluate = FALSE,
-    eval_fraction = 0.05,
-    eval_seed = 42L,
-    verbose = FALSE) {
-  valid <- c("xgboost", "softimpute", "flashier")
-  if (!is.character(methods) || length(methods) == 0L) {
-    stop("methods must be a non-empty character vector.")
-  }
-  bad <- setdiff(methods, valid)
-  if (length(bad) > 0) {
-    stop("Unknown imputation method(s): ", paste(bad, collapse = ", "),
-         ". Valid options: ", paste(valid, collapse = ", "))
-  }
-  methods <- unique(methods)
-
-  Xfull <- as.matrix(data)
-  n <- nrow(Xfull); p <- ncol(Xfull)
-  na_mask_full <- is.na(Xfull)
-  na_per_col <- colSums(na_mask_full)
-  missingness <- list(
-    per_column = na_per_col,
-    n_missing = sum(na_mask_full),
-    fraction = sum(na_mask_full) / (n * p)
-  )
-  if (verbose) {
-    message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.",
-                    n, p, 100 * missingness$fraction))
-  }
-
-  dispatch <- list(xgboost = xgboost_imputation,
-                   softimpute = softimpute_imputation,
-                   flashier = flashier_imputation)
-
-  run_one <- function(m, X_in) {
-    args <- method_args[[m]] %||% list()
-    do.call(dispatch[[m]], c(list(X_in), args))
-  }
-
-  # If evaluating, hold out a random subset of observed cells, run each
-  # method against the masked matrix, then score RMSE on the held-out
-  # cells. Final imputation is fit against the original matrix so the
-  # returned matrices use every observed cell.
-  eval_df <- NULL
-  if (isTRUE(evaluate)) {
-    obs_idx <- which(!na_mask_full)
-    n_hold <- max(1L, floor(eval_fraction * length(obs_idx)))
-    if (n_hold >= length(obs_idx)) {
-      stop("eval_fraction too large; would mask all observed entries.")
-    }
-    set.seed(eval_seed)
-    hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE)
-    Xmasked <- Xfull
-    held_truth <- Xfull[hold_idx]
-    Xmasked[hold_idx] <- NA_real_
-    eval_records <- vector("list", length(methods))
-    for (i in seq_along(methods)) {
-      m <- methods[i]
-      if (verbose) message("Evaluating ", m, " on held-out mask ...")
-      t0 <- proc.time()
-      Ximp_m <- run_one(m, Xmasked)
-      t1 <- proc.time()
-      pred <- Ximp_m[hold_idx]
-      rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE))
-      eval_records[[i]] <- data.frame(
-        method = m, rmse = rmse, n_held_out = n_hold,
-        time_seconds = as.numeric((t1 - t0)["elapsed"]),
-        stringsAsFactors = FALSE
-      )
-    }
-    eval_df <- do.call(rbind, eval_records)
-    rownames(eval_df) <- NULL
-  }
-
-  # Final imputation: run each method on the full matrix
-  imputed <- list()
-  for (m in methods) {
-    if (verbose) message("Imputing with ", m, " ...")
-    imputed[[m]] <- run_one(m, Xfull)
-  }
-
-  list(
-    imputed = imputed,
-    missingness = missingness,
-    evaluation = eval_df,
-    method_args = method_args
-  )
-}
 
 #' Robust Mahalanobis Distance
 #'
diff --git a/man/flashier_imputation.Rd b/man/flashier_imputation.Rd
index 90578084..7762c785 100644
--- a/man/flashier_imputation.Rd
+++ b/man/flashier_imputation.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/misc.R
+% Please edit documentation in R/imputation.R
 \name{flashier_imputation}
 \alias{flashier_imputation}
 \title{flashier-based matrix imputation}
diff --git a/man/molecular_trait_imputation_pipeline.Rd b/man/molecular_trait_imputation_pipeline.Rd
index 68a74d6a..7932803b 100644
--- a/man/molecular_trait_imputation_pipeline.Rd
+++ b/man/molecular_trait_imputation_pipeline.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/misc.R
+% Please edit documentation in R/imputation.R
 \name{molecular_trait_imputation_pipeline}
 \alias{molecular_trait_imputation_pipeline}
 \title{Pipeline for molecular trait imputation}
diff --git a/man/softimpute_imputation.Rd b/man/softimpute_imputation.Rd
index efb08fa5..89eadff7 100644
--- a/man/softimpute_imputation.Rd
+++ b/man/softimpute_imputation.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/misc.R
+% Please edit documentation in R/imputation.R
 \name{softimpute_imputation}
 \alias{softimpute_imputation}
 \title{softImpute-based matrix imputation}
@@ -8,6 +8,8 @@ softimpute_imputation(
   data,
   rank.max = NULL,
   lambda = NULL,
+  tune_fraction = 0.05,
+  tune_seed = 42L,
   type = c("als", "svd"),
   thresh = 1e-05,
   maxit = 100,
@@ -21,8 +23,14 @@ softimpute_imputation(
 \item{rank.max}{Maximum rank of the SVD solution. Default
 \code{min(dim(data) - 1)}.}
 
-\item{lambda}{Nuclear-norm penalty. NULL (default) chooses
-\code{0.5 * lambda0(data)} so the solution is not degenerate.}
+\item{lambda}{Nuclear-norm penalty. \code{NULL} (default) tunes over
+a default geometric grid via held-out CV. A scalar uses that lambda
+directly with no tuning. A vector defines the tuning grid.}
+
+\item{tune_fraction}{Fraction of observed cells held out for CV.
+Default 0.05. Ignored when \code{lambda} is a scalar.}
+
+\item{tune_seed}{Seed for the held-out mask. Default 42L.}
 
 \item{type}{Optimisation type passed to \code{softImpute::softImpute};
 one of \code{"als"} (default, faster) or \code{"svd"}.}
@@ -31,17 +39,29 @@ one of \code{"als"} (default, faster) or \code{"svd"}.}
 
 \item{maxit}{Maximum iterations. Default 100.}
 
-\item{verbose}{Logical, print progress. Default FALSE.}
+\item{verbose}{Logical, print progress (including per-lambda CV RMSE).
+Default FALSE.}
 }
 \value{
 The imputed matrix with the same dimensions as the input
-  (minus any all-NA columns).
+  (minus any all-NA columns). Carries attributes
+  \code{"softimpute_lambda"} (the lambda used for the final fit) and,
+  when tuning was performed, \code{"softimpute_lambda_cv"} (a
+  data.frame of grid-search RMSE values).
 }
 \description{
 Imputes missing values in a numeric matrix via the soft-thresholded SVD
 algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
-alternating-least-squares solver by default; \code{lambda} defaults to
-a sensible point on the lambda grid (\code{0.5 * lambda0(x)}) when not
-supplied. Columns with all entries missing are removed before fitting
-and a one-line message records the count.
+alternating-least-squares solver by default.
+}
+\details{
+\code{lambda} drives the bias-variance trade-off. By default the
+function tunes it via held-out cross-validation: a random fraction of
+observed cells is masked, softImpute is fit across a geometric grid of
+lambdas with warm-start, and the lambda minimising RMSE on the masked
+cells is selected. The final imputation then uses that lambda on the
+full input. Pass a scalar \code{lambda} to skip tuning.
+
+Columns with all entries missing are removed before fitting and a
+one-line message records the count.
 }
diff --git a/man/xgboost_imputation.Rd b/man/xgboost_imputation.Rd
index 0bc3ef03..b09309ee 100644
--- a/man/xgboost_imputation.Rd
+++ b/man/xgboost_imputation.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/misc.R
+% Please edit documentation in R/imputation.R
 \name{xgboost_imputation}
 \alias{xgboost_imputation}
 \title{XGBoost-based iterative imputation of missing values}
diff --git a/tests/testthat/test_molecular_imputation.R b/tests/testthat/test_molecular_imputation.R
index 2c68b808..54d39a4a 100644
--- a/tests/testthat/test_molecular_imputation.R
+++ b/tests/testthat/test_molecular_imputation.R
@@ -40,6 +40,51 @@ test_that("softimpute_imputation: handles all-NA columns by removing them", {
   expect_equal(ncol(Xi), ncol(X) - 1L)
 })
 
+test_that("softimpute_imputation: default tunes lambda via CV and attaches grid", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  Xi <- softimpute_imputation(inp$X)  # default lambda = NULL -> tune
+  expect_false(is.null(attr(Xi, "softimpute_lambda")))
+  grid <- attr(Xi, "softimpute_lambda_cv")
+  expect_s3_class(grid, "data.frame")
+  expect_named(grid, c("lambda", "rmse"))
+  expect_true(all(diff(grid$lambda) <= 0))  # grid sorted descending
+  expect_true(any(is.finite(grid$rmse)))
+  # The chosen lambda must be one of the grid points
+  expect_true(attr(Xi, "softimpute_lambda") %in% grid$lambda)
+})
+
+test_that("softimpute_imputation: scalar lambda bypasses tuning", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  l0 <- softImpute::lambda0(inp$X)
+  Xi <- softimpute_imputation(inp$X, lambda = 0.3 * l0)
+  expect_equal(attr(Xi, "softimpute_lambda"), 0.3 * l0)
+  expect_null(attr(Xi, "softimpute_lambda_cv"))
+})
+
+test_that("softimpute_imputation: vector lambda restricts tuning to that grid", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs()
+  l0 <- softImpute::lambda0(inp$X)
+  custom_grid <- l0 * c(0.05, 0.1, 0.25, 0.5)
+  Xi <- softimpute_imputation(inp$X, lambda = custom_grid)
+  grid <- attr(Xi, "softimpute_lambda_cv")
+  expect_setequal(grid$lambda, custom_grid)
+  expect_true(attr(Xi, "softimpute_lambda") %in% custom_grid)
+})
+
+test_that("softimpute_imputation: chosen lambda minimises CV RMSE on the grid", {
+  skip_if_not_installed("softImpute")
+  inp <- .make_imputation_inputs(n = 80, p = 20, rank = 3, seed = 11)
+  Xi <- softimpute_imputation(inp$X)
+  grid <- attr(Xi, "softimpute_lambda_cv")
+  chosen <- attr(Xi, "softimpute_lambda")
+  ok <- which(is.finite(grid$rmse))
+  best_idx <- ok[which.min(grid$rmse[ok])]
+  expect_equal(chosen, grid$lambda[best_idx])
+})
+
 # =============================================================================
 # flashier_imputation
 # =============================================================================
diff --git a/vignettes/molecular-imputation.Rmd b/vignettes/molecular-imputation.Rmd
index bbd7fd14..49334d85 100644
--- a/vignettes/molecular-imputation.Rmd
+++ b/vignettes/molecular-imputation.Rmd
@@ -128,7 +128,7 @@ The pipeline reports three things:
 
 | Method | Typical RMSE behaviour | Cost | Notes |
 |---|---|---|---|
-| **softimpute** | Best on smooth low-rank matrices. Sensitive to the choice of `lambda`; the default `0.5 * lambda0(x)` is a reasonable starting point. | Very fast (seconds). | Tune `lambda` and `rank.max` if RMSE is poor on your data. |
+| **softimpute** | Best on smooth low-rank matrices. `lambda` is auto-tuned by held-out CV by default. | Very fast (seconds) plus the lambda-grid pass. | Pass a scalar `lambda` to skip tuning; pass a vector to restrict the CV grid. |
 | **flashier** | Best on mixed sparse + diffuse signal. Default priors (`ebnm_normal` + `ebnm_normal_scale_mixture`) work well for most expression-like matrices. | Moderate (seconds-to-minutes). | Reuses the same FLASH defaults as `compute_cov_flash()` (mr.mash prior step). |
 | **xgboost** | Robust to non-linear column relationships. Can overfit on small matrices. | Slowest (minutes-to-hours for large matrices). | Set `num_workers > 1` for parallel per-column fits. Tune `max_depth` and `nrounds`. |
 
@@ -175,10 +175,14 @@ do not re-impute internally.
 3. **CV evaluation uses one random mask.** For more stable RMSE
    estimates, run the pipeline a few times with different `eval_seed`
    values and average.
-4. **softImpute lambda tuning matters.** The default
-   `0.5 * lambda0(x)` is a starting point. If softImpute looks
-   uncompetitive on your data, sweep `lambda` over a grid (e.g.
-   `lambda = lambda0(x) * c(0.1, 0.25, 0.5, 1, 2)`).
+4. **softImpute `lambda` is auto-tuned by default.** A random
+   fraction of observed cells (default 5%) is held out, softImpute is
+   fit across a geometric grid of lambdas via warm-start, and the
+   lambda with the lowest held-out RMSE wins. Pass `lambda = <scalar>`
+   to skip tuning; pass `lambda = <vector>` to restrict the grid.
+   The chosen lambda and the full grid scores are attached to the
+   returned matrix as the `"softimpute_lambda"` and
+   `"softimpute_lambda_cv"` attributes.
 
 ```{r session-info}
 sessionInfo()

From 3f2218b02cb8cac7f942bc77386c91204ed21c80 Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Fri, 5 Jun 2026 15:22:53 -0700
Subject: [PATCH 4/6] clean up imputation and twas z-score

---
 vignettes/molecular-imputation.Rmd | 189 -----------------------------
 1 file changed, 189 deletions(-)
 delete mode 100644 vignettes/molecular-imputation.Rmd

diff --git a/vignettes/molecular-imputation.Rmd b/vignettes/molecular-imputation.Rmd
deleted file mode 100644
index 49334d85..00000000
--- a/vignettes/molecular-imputation.Rmd
+++ /dev/null
@@ -1,189 +0,0 @@
----
-title: "Imputing Missing Values in Molecular Trait Matrices"
-author: "pecotmr authors"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Imputing Missing Values in Molecular Trait Matrices}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
-                      fig.width = 7, fig.height = 4.5)
-```
-
-## Overview
-
-Real molecular trait data (expression, splicing, protein abundance,
-chromatin accessibility) almost always has missing values: low-coverage
-samples, transcripts below detection in some tissues, batch dropouts.
-Many downstream pecotmr workflows expect a complete matrix:
-
-- [Fine-mapping](fine-mapping.html) — `univariate_analysis_pipeline()`
-  requires a numeric `Y` vector with no NAs.
-- [TWAS weights](twas-weights.html) — `twas_weights_pipeline()` and
-  `twas_multivariate_weights_pipeline()` require complete `Y`.
-- Multivariate fits — `mrmash_wrapper()` and `multivariate_analysis_pipeline()`
-  reject NA in `Y`.
-
-`pecotmr` provides three imputation methods plus a unified pipeline.
-The methods differ in what they assume about the data:
-
-| Method | Function | Assumption | When to use |
-|---|---|---|---|
-| **XGBoost** | `xgboost_imputation()` | None — fits arbitrary nonlinear per-column models | Heterogeneous matrices where columns have different relationships. Slowest but most flexible. |
-| **softImpute** | `softimpute_imputation()` | Low-rank smooth signal | Expression matrices with a small number of dominant axes of variation. Fastest. |
-| **flashier** | `flashier_imputation()` | Sparse + scale-mixture EBMF factors | Matrices with sparse + diffuse signal mixtures. Good middle ground; reuses the same prior families as `compute_cov_flash()` for mr.mash priors. |
-
-All three are exposed through `molecular_trait_imputation_pipeline()`,
-which can fit multiple methods in one call and evaluate them on a
-held-out random mask of observed entries.
-
-```{r load-packages}
-library(pecotmr)
-```
-
-## Quick start: one method on a synthetic matrix
-
-A low-rank synthetic example illustrates the contract. We build a
-60 × 15 matrix with rank-3 truth + Gaussian noise, then mask 10% of
-entries.
-
-```{r make-synthetic}
-set.seed(1)
-n <- 60; p <- 15; rank_true <- 3
-U <- matrix(rnorm(n * rank_true), n, rank_true)
-V <- matrix(rnorm(p * rank_true), p, rank_true)
-truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p)
-rownames(truth) <- paste0("S", seq_len(n))
-colnames(truth) <- paste0("F", seq_len(p))
-
-# Mask 10% of entries
-na_mask <- matrix(runif(n * p) < 0.1, n, p)
-X <- truth
-X[na_mask] <- NA
-sprintf("Missingness: %.1f%%", 100 * mean(is.na(X)))
-```
-
-Run the three methods individually:
-
-```{r single-method-runs}
-Xi_softimpute <- softimpute_imputation(X)
-Xi_flashier   <- flashier_imputation(X, verbose = 0)
-# XGBoost can be slow; reduce iterations for the vignette.
-Xi_xgboost    <- xgboost_imputation(X, maxiter = 3, nrounds = 20,
-                                     verbose = FALSE)
-
-# RMSE against the truth on the masked cells
-rmse <- function(Xi) sqrt(mean((Xi[na_mask] - truth[na_mask])^2))
-cat(sprintf("softImpute RMSE: %.3f\n", rmse(Xi_softimpute)))
-cat(sprintf("flashier   RMSE: %.3f\n", rmse(Xi_flashier)))
-cat(sprintf("xgboost    RMSE: %.3f\n", rmse(Xi_xgboost)))
-```
-
-On rank-3 truth, flashier and softImpute both exploit the low-rank
-structure; XGBoost has no structural assumption and pays for that on
-this kind of input. Real expression matrices tend to be closer to the
-low-rank regime, so softImpute and flashier are usually competitive.
-
-## The pipeline: multi-method + held-out CV
-
-For real data you typically want to compare methods before picking one.
-`molecular_trait_imputation_pipeline()` does this in a single call.
-With `evaluate = TRUE` it holds out a random fraction of observed cells,
-fits each method against the masked matrix, and reports RMSE on the
-masked cells.
-
-```{r pipeline-multi}
-res <- molecular_trait_imputation_pipeline(
-  X,
-  methods       = c("xgboost", "softimpute", "flashier"),
-  method_args   = list(
-    xgboost = list(maxiter = 3, nrounds = 20, verbose = FALSE)
-  ),
-  evaluate      = TRUE,
-  eval_fraction = 0.05,
-  eval_seed     = 7
-)
-
-res$missingness$fraction          # overall missingness
-res$evaluation                     # per-method RMSE + time
-str(res$imputed, max.level = 1)    # final imputed matrices, one per method
-```
-
-The pipeline reports three things:
-
-- **`imputed`** — named list of imputed matrices, one per requested
-  method. These were fit on the **full** input (including the cells
-  that were held out for CV), so they are what you would use
-  downstream.
-- **`missingness`** — per-column NA counts and overall NA fraction.
-- **`evaluation`** — when `evaluate = TRUE`, a data.frame with one row
-  per method: `method`, `rmse`, `n_held_out`, `time_seconds`. Use it to
-  pick the method that best handles your data's structure.
-
-## Choosing a method
-
-| Method | Typical RMSE behaviour | Cost | Notes |
-|---|---|---|---|
-| **softimpute** | Best on smooth low-rank matrices. `lambda` is auto-tuned by held-out CV by default. | Very fast (seconds) plus the lambda-grid pass. | Pass a scalar `lambda` to skip tuning; pass a vector to restrict the CV grid. |
-| **flashier** | Best on mixed sparse + diffuse signal. Default priors (`ebnm_normal` + `ebnm_normal_scale_mixture`) work well for most expression-like matrices. | Moderate (seconds-to-minutes). | Reuses the same FLASH defaults as `compute_cov_flash()` (mr.mash prior step). |
-| **xgboost** | Robust to non-linear column relationships. Can overfit on small matrices. | Slowest (minutes-to-hours for large matrices). | Set `num_workers > 1` for parallel per-column fits. Tune `max_depth` and `nrounds`. |
-
-A simple rule of thumb: run all three with `evaluate = TRUE` on a
-small subset of your data, then use the best for the full matrix.
-
-## Integration with the rest of pecotmr
-
-The imputed matrix slots directly into the downstream pipelines:
-
-```{r downstream, eval=FALSE}
-# Imputed phenotype matrix from the pipeline
-Y_complete <- res$imputed[["flashier"]]
-
-# Fit fine-mapping on a single context (one column of Y)
-fit <- univariate_analysis_pipeline(
-  X = genotype_matrix, Y = Y_complete[, "F1"], maf = maf_vector,
-  twas_weights = TRUE
-)
-
-# Or fit multivariate models on the full matrix
-mnm_fit <- multivariate_analysis_pipeline(
-  X = genotype_matrix, Y = Y_complete,
-  data_driven_prior_matrices = prior_matrices
-)
-mt <- twas_multivariate_weights_pipeline(
-  X = genotype_matrix, Y = Y_complete, mnm_fit = mnm_fit
-)
-```
-
-Imputation happens once, upstream of fine-mapping and TWAS weight
-training, on the full phenotype matrix across contexts. The pipelines
-do not re-impute internally.
-
-## Caveats
-
-1. **Don't over-impute.** A column that's missing in most samples won't
-   recover useful structure no matter which method you use. Drop
-   columns with high missingness (`> 50%` is a common cutoff) before
-   imputing.
-2. **All-NA columns are dropped.** Each method removes columns where
-   every entry is missing and emits a one-line message. The returned
-   matrix has fewer columns than the input in that case.
-3. **CV evaluation uses one random mask.** For more stable RMSE
-   estimates, run the pipeline a few times with different `eval_seed`
-   values and average.
-4. **softImpute `lambda` is auto-tuned by default.** A random
-   fraction of observed cells (default 5%) is held out, softImpute is
-   fit across a geometric grid of lambdas via warm-start, and the
-   lambda with the lowest held-out RMSE wins. Pass `lambda = <scalar>`
-   to skip tuning; pass `lambda = <vector>` to restrict the grid.
-   The chosen lambda and the full grid scores are attached to the
-   returned matrix as the `"softimpute_lambda"` and
-   `"softimpute_lambda_cv"` attributes.
-
-```{r session-info}
-sessionInfo()
-```

From 894dd3ad64a54ebc1b46062dfac6112a27da1c18 Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Fri, 5 Jun 2026 19:10:13 -0700
Subject: [PATCH 5/6] vignette updates

---
 DESCRIPTION                                |   5 +-
 NAMESPACE                                  |   5 -
 R/imputation.R                             | 468 ---------------------
 _pkgdown.yml                               |   6 -
 man/flashier_imputation.Rd                 |  48 ---
 man/molecular_trait_imputation_pipeline.Rd |  68 ---
 man/softimpute_imputation.Rd               |  67 ---
 man/xgboost_imputation.Rd                  |  43 --
 tests/testthat/test_misc.R                 |  44 --
 tests/testthat/test_molecular_imputation.R | 185 --------
 10 files changed, 1 insertion(+), 938 deletions(-)
 delete mode 100644 R/imputation.R
 delete mode 100644 man/flashier_imputation.Rd
 delete mode 100644 man/molecular_trait_imputation_pipeline.Rd
 delete mode 100644 man/softimpute_imputation.Rd
 delete mode 100644 man/xgboost_imputation.Rd
 delete mode 100644 tests/testthat/test_molecular_imputation.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 830def16..acc98593 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -74,10 +74,8 @@ Suggests:
     rtracklayer,
     SNPRelate,
     snpStats,
-    softImpute,
     testthat,
-    VariantAnnotation,
-    xgboost
+    VariantAnnotation
 Remotes:
     stephenslab/fsusieR,
     stephenslab/mvsusieR,
@@ -112,7 +110,6 @@ Collate:
     'h2_hdl.R'
     'h2_lder.R'
     'h2_utils.R'
-    'imputation.R'
     'ld_loader.R'
     'mash_wrapper.R'
     'misc.R'
diff --git a/NAMESPACE b/NAMESPACE
index 0bb64eff..8752a7c7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -72,7 +72,6 @@ export(find_overlapping_regions)
 export(fine_mr)
 export(fit_mash_contrast)
 export(fit_susie_inf_then_susie_rss)
-export(flashier_imputation)
 export(format_finemapping_output)
 export(fsusie_get_cs)
 export(fsusie_wrapper)
@@ -174,7 +173,6 @@ export(merge_sumstats_matrices)
 export(merge_variant_info)
 export(meta_analysis_per_cell)
 export(meta_sldsc_random)
-export(molecular_trait_imputation_pipeline)
 export(mr_analysis)
 export(mr_ash_rss_weights)
 export(mr_format)
@@ -218,7 +216,6 @@ export(sdpr_weights)
 export(slalom)
 export(sldsc_postprocessing_pipeline)
 export(slice_mash_data)
-export(softimpute_imputation)
 export(standardise_sumstats_columns)
 export(standardize_sldsc_trait)
 export(subsetChr)
@@ -247,7 +244,6 @@ export(univariate_analysis_pipeline)
 export(update_mash_model_cov)
 export(wald_test_pval)
 export(writeSumstatsVcf)
-export(xgboost_imputation)
 export(xqtl_enrichment_wrapper)
 export(z_to_pvalue)
 exportClasses(AlleleQCResult)
@@ -327,7 +323,6 @@ import(dplyr)
 import(tibble)
 import(tidyr)
 importFrom(BiocParallel,MulticoreParam)
-importFrom(BiocParallel,SerialParam)
 importFrom(BiocParallel,bplapply)
 importFrom(BiocParallel,bpparam)
 importFrom(BiocParallel,bpworkers)
diff --git a/R/imputation.R b/R/imputation.R
deleted file mode 100644
index 226d6be7..00000000
--- a/R/imputation.R
+++ /dev/null
@@ -1,468 +0,0 @@
-# =============================================================================
-# Molecular trait imputation
-#
-# User-facing imputation methods for filling missing values in molecular trait
-# matrices (expression, splicing, protein abundance) before fine-mapping and
-# TWAS-weight training. Three method wrappers plus a unified pipeline; see the
-# molecular-imputation vignette for when to use which method.
-#
-# The genotype-side mean_impute() helper lives in misc.R because it is an
-# internal building block of compute_LD() and genotype loading, not a
-# user-facing imputer.
-# =============================================================================
-
-#' XGBoost-based iterative imputation of missing values
-#'
-#' Imputes missing values in a numeric matrix by iteratively training
-#' per-column XGBoost models on observed entries and predicting missing ones.
-#' Columns that are entirely missing are removed. Initial imputation uses
-#' column means.
-#'
-#' @param data Numeric matrix with missing values (NA).
-#' @param maxiter Maximum number of imputation iterations (default 10).
-#' @param max_depth Maximum tree depth for XGBoost (default 2).
-#' @param nrounds Number of boosting rounds per variable (default 50).
-#' @param decreasing Logical. If TRUE, impute variables with most missing
-#'   values first. Default FALSE (fewest missing first).
-#' @param num_workers Number of parallel workers for BiocParallel. Default 1
-#'   (sequential).
-#' @param verbose Logical, print progress (default TRUE).
-#' @return The imputed matrix with the same dimensions as the input (minus
-#'   any all-NA columns).
-#' @importFrom BiocParallel MulticoreParam SerialParam bplapply
-#' @export
-xgboost_imputation <- function(data, maxiter = 10L, max_depth = 2L,
-                                nrounds = 50L, decreasing = FALSE,
-                                num_workers = 1L, verbose = TRUE) {
-  if (!requireNamespace("xgboost", quietly = TRUE))
-    stop("Package 'xgboost' is required for xgboost_imputation")
-
-  xmis <- as.matrix(data)
-  n <- nrow(xmis)
-  p <- ncol(xmis)
-
-  # Remove completely missing columns
-  all_na <- colSums(is.na(xmis)) == n
-  if (any(all_na)) {
-    if (verbose)
-      message("Removed ", sum(all_na), " column(s) with all entries missing.")
-    xmis <- xmis[, !all_na, drop = FALSE]
-    p <- ncol(xmis)
-  }
-
-  # Initial mean imputation
-  ximp <- xmis
-  col_means <- colMeans(xmis, na.rm = TRUE)
-  for (j in seq_len(p)) {
-    ximp[is.na(xmis[, j]), j] <- col_means[j]
-  }
-
-  # Missing value locations
-  NAloc <- is.na(xmis)
-  noNAvar <- colSums(NAloc)
-  sort_j <- order(noNAvar, decreasing = decreasing)
-  nzsort_j <- sort_j[noNAvar[sort_j] > 0]
-
-  if (length(nzsort_j) == 0) {
-    if (verbose) message("No missing values to impute.")
-    return(ximp)
-  }
-
-  # Set up BiocParallel
-  if (num_workers > 1L) {
-    BPPARAM <- MulticoreParam(workers = num_workers)
-  } else {
-    BPPARAM <- SerialParam()
-  }
-
-  iter <- 0L
-  conv_new <- 0
-  conv_old <- Inf
-  ximp_history <- vector("list", maxiter)
-
-  while (conv_new < conv_old && iter < maxiter) {
-    if (iter > 0) conv_old <- conv_new
-    if (verbose) message("  XGBoost iteration ", iter + 1L, " in progress...")
-
-    ximp_old <- ximp
-
-    # Impute each variable with missing values
-    impute_one <- function(var_idx) {
-      obsi <- !NAloc[, var_idx]
-      misi <- NAloc[, var_idx]
-      obsY <- ximp[obsi, var_idx]
-      obsX <- ximp[obsi, -var_idx, drop = FALSE]
-      misX <- ximp[misi, -var_idx, drop = FALSE]
-
-      xgb_train <- xgboost::xgb.DMatrix(data = obsX, label = obsY)
-      xgb_pred <- xgboost::xgb.DMatrix(data = misX)
-      model <- xgboost::xgb.train(
-        params = list(max_depth = max_depth, verbosity = 0),
-        data = xgb_train, nrounds = nrounds)
-      list(var_idx = var_idx, predicted = predict(model, xgb_pred))
-    }
-
-    results <- bplapply(nzsort_j, impute_one, BPPARAM = BPPARAM)
-
-    for (res in results) {
-      misi <- NAloc[, res$var_idx]
-      ximp[misi, res$var_idx] <- res$predicted
-    }
-
-    iter <- iter + 1L
-    ximp_history[[iter]] <- ximp
-
-    # Convergence: relative change in imputed values
-    conv_new <- sum((ximp - ximp_old)^2) / sum(ximp^2)
-  }
-
-  # Return last improving iteration
-  if (iter == maxiter) ximp_history[[iter]] else ximp_history[[max(iter - 1L, 1L)]]
-}
-
-#' softImpute-based matrix imputation
-#'
-#' Imputes missing values in a numeric matrix via the soft-thresholded SVD
-#' algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
-#' alternating-least-squares solver by default.
-#'
-#' \code{lambda} drives the bias-variance trade-off. By default the
-#' function tunes it via held-out cross-validation: a random fraction of
-#' observed cells is masked, softImpute is fit across a geometric grid of
-#' lambdas with warm-start, and the lambda minimising RMSE on the masked
-#' cells is selected. The final imputation then uses that lambda on the
-#' full input. Pass a scalar \code{lambda} to skip tuning.
-#'
-#' Columns with all entries missing are removed before fitting and a
-#' one-line message records the count.
-#'
-#' @param data Numeric matrix (samples x features) with missing values
-#'   (NA).
-#' @param rank.max Maximum rank of the SVD solution. Default
-#'   \code{min(dim(data) - 1)}.
-#' @param lambda Nuclear-norm penalty. \code{NULL} (default) tunes over
-#'   a default geometric grid via held-out CV. A scalar uses that lambda
-#'   directly with no tuning. A vector defines the tuning grid.
-#' @param tune_fraction Fraction of observed cells held out for CV.
-#'   Default 0.05. Ignored when \code{lambda} is a scalar.
-#' @param tune_seed Seed for the held-out mask. Default 42L.
-#' @param type Optimisation type passed to \code{softImpute::softImpute};
-#'   one of \code{"als"} (default, faster) or \code{"svd"}.
-#' @param thresh Convergence threshold. Default 1e-5.
-#' @param maxit Maximum iterations. Default 100.
-#' @param verbose Logical, print progress (including per-lambda CV RMSE).
-#'   Default FALSE.
-#' @return The imputed matrix with the same dimensions as the input
-#'   (minus any all-NA columns). Carries attributes
-#'   \code{"softimpute_lambda"} (the lambda used for the final fit) and,
-#'   when tuning was performed, \code{"softimpute_lambda_cv"} (a
-#'   data.frame of grid-search RMSE values).
-#' @export
-softimpute_imputation <- function(data, rank.max = NULL, lambda = NULL,
-                                   tune_fraction = 0.05,
-                                   tune_seed = 42L,
-                                   type = c("als", "svd"),
-                                   thresh = 1e-5, maxit = 100,
-                                   verbose = FALSE) {
-  if (!requireNamespace("softImpute", quietly = TRUE)) {
-    stop("Package 'softImpute' is required for softimpute_imputation.")
-  }
-  type <- match.arg(type)
-  X <- as.matrix(data)
-  n <- nrow(X); p <- ncol(X)
-  all_na <- colSums(is.na(X)) == n
-  if (any(all_na)) {
-    if (verbose) message("Removed ", sum(all_na), " column(s) with all entries missing.")
-    X <- X[, !all_na, drop = FALSE]
-    p <- ncol(X)
-  }
-  if (is.null(rank.max)) rank.max <- max(1L, min(n, p) - 1L)
-  lambda0_val <- softImpute::lambda0(X)
-
-  # Decide whether to tune. lambda = scalar => no tuning; NULL/vector => tune.
-  tune <- is.null(lambda) || length(lambda) > 1L
-  if (tune) {
-    if (is.null(lambda)) {
-      # Default geometric grid from lambda0 down to 1e-3 * lambda0.
-      lambda_grid <- exp(seq(log(lambda0_val),
-                              log(max(1e-3 * lambda0_val, .Machine$double.eps)),
-                              length.out = 10L))
-    } else {
-      lambda_grid <- sort(as.numeric(lambda), decreasing = TRUE)
-    }
-    cv_res <- .softimpute_cv_lambda(X, lambda_grid = lambda_grid,
-                                     rank.max = rank.max,
-                                     tune_fraction = tune_fraction,
-                                     tune_seed = tune_seed,
-                                     type = type, thresh = thresh, maxit = maxit,
-                                     verbose = verbose)
-    lambda <- cv_res$best_lambda
-    if (verbose) {
-      message(sprintf("softimpute_imputation: tuned lambda = %.4g (CV RMSE %.4f over %d cells)",
-                      lambda, cv_res$best_rmse, cv_res$n_held_out))
-    }
-  }
-
-  fit <- softImpute::softImpute(X, rank.max = rank.max, lambda = lambda,
-                                 type = type, thresh = thresh, maxit = maxit,
-                                 trace.it = isTRUE(verbose))
-  Ximp <- softImpute::complete(X, fit)
-  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
-  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
-  attr(Ximp, "softimpute_lambda") <- lambda
-  if (tune) attr(Ximp, "softimpute_lambda_cv") <- cv_res$grid
-  Ximp
-}
-
-# Internal: CV-tune softImpute's lambda over a grid using a held-out mask.
-# Uses softImpute's warm.start feature to chain fits along the lambda path
-# (largest lambda first). Returns the chosen lambda, its CV RMSE, and the
-# full grid scores.
-.softimpute_cv_lambda <- function(X, lambda_grid, rank.max,
-                                   tune_fraction, tune_seed,
-                                   type, thresh, maxit, verbose) {
-  na_mask <- is.na(X)
-  obs_idx <- which(!na_mask)
-  n_hold <- max(1L, floor(tune_fraction * length(obs_idx)))
-  if (n_hold >= length(obs_idx)) {
-    stop("tune_fraction too large; would mask all observed entries.")
-  }
-  set.seed(tune_seed)
-  hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE)
-  truth <- X[hold_idx]
-  Xmasked <- X
-  Xmasked[hold_idx] <- NA_real_
-
-  lambdas <- sort(as.numeric(lambda_grid), decreasing = TRUE)
-  rmse_vec <- numeric(length(lambdas))
-  warm <- NULL
-  for (i in seq_along(lambdas)) {
-    fit_args <- list(x = Xmasked, rank.max = rank.max, lambda = lambdas[i],
-                     type = type, thresh = thresh, maxit = maxit,
-                     trace.it = FALSE)
-    if (!is.null(warm)) fit_args$warm.start <- warm
-    fit <- tryCatch(do.call(softImpute::softImpute, fit_args),
-                    error = function(e) NULL)
-    if (is.null(fit)) {
-      rmse_vec[i] <- NA_real_
-      next
-    }
-    Ximp <- softImpute::complete(Xmasked, fit)
-    rmse_vec[i] <- sqrt(mean((Ximp[hold_idx] - truth)^2, na.rm = TRUE))
-    warm <- fit
-    if (verbose) {
-      message(sprintf("  softimpute CV lambda = %.4g -> RMSE %.4f",
-                      lambdas[i], rmse_vec[i]))
-    }
-  }
-  grid <- data.frame(lambda = lambdas, rmse = rmse_vec,
-                     stringsAsFactors = FALSE)
-  ok <- which(is.finite(rmse_vec))
-  if (length(ok) == 0L) {
-    stop("All softImpute CV fits failed; supply lambda explicitly.")
-  }
-  best <- ok[which.min(rmse_vec[ok])]
-  list(best_lambda = lambdas[best], best_rmse = rmse_vec[best],
-       n_held_out = n_hold, grid = grid)
-}
-
-#' flashier-based matrix imputation
-#'
-#' Imputes missing values in a numeric matrix via empirical Bayes
-#' matrix factorisation (\code{flashier::flash}). \code{flash} natively
-#' supports NA entries and reconstructs them from the posterior fitted
-#' values. The default prior families match those used by
-#' \code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row
-#' factor and \code{ebnm::ebnm_normal_scale_mixture} on the column
-#' factor.
-#'
-#' @param data Numeric matrix (samples x features) with missing values
-#'   (NA).
-#' @param var_type Variance structure for the residuals, passed to
-#'   \code{flashier::flash}. Default 2 (per-column).
-#' @param ebnm_fn Prior family. Default
-#'   \code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.
-#' @param greedy_Kmax Maximum number of greedy factors. Default
-#'   \code{min(20L, min(dim(data)) - 1L)}.
-#' @param backfit Logical; run a final backfit pass. Default TRUE.
-#' @param verbose Numeric verbosity (0 silent, default).
-#' @param ... Additional arguments forwarded to \code{flashier::flash}.
-#' @return The imputed matrix with the same dimensions as the input
-#'   (minus any all-NA columns).
-#' @export
-flashier_imputation <- function(data, var_type = 2,
-                                 ebnm_fn = c(ebnm::ebnm_normal,
-                                             ebnm::ebnm_normal_scale_mixture),
-                                 greedy_Kmax = NULL,
-                                 backfit = TRUE, verbose = 0, ...) {
-  if (!requireNamespace("flashier", quietly = TRUE)) {
-    stop("Package 'flashier' is required for flashier_imputation.")
-  }
-  if (!requireNamespace("ebnm", quietly = TRUE)) {
-    stop("Package 'ebnm' is required for flashier_imputation.")
-  }
-  X <- as.matrix(data)
-  n <- nrow(X); p <- ncol(X)
-  all_na <- colSums(is.na(X)) == n
-  if (any(all_na)) {
-    if (isTRUE(verbose) || (is.numeric(verbose) && verbose >= 1)) {
-      message("Removed ", sum(all_na), " column(s) with all entries missing.")
-    }
-    X <- X[, !all_na, drop = FALSE]
-    p <- ncol(X)
-  }
-  if (is.null(greedy_Kmax)) greedy_Kmax <- max(1L, min(20L, min(n, p) - 1L))
-  fl <- flashier::flash(X, var_type = var_type, ebnm_fn = ebnm_fn,
-                         greedy_Kmax = greedy_Kmax, backfit = backfit,
-                         verbose = verbose, ...)
-  # Posterior fitted value: L_pm %*% t(F_pm). When no factors are found,
-  # fall back to column means.
-  if (is.null(fl$n_factors) || fl$n_factors == 0) {
-    fitted <- matrix(colMeans(X, na.rm = TRUE),
-                     nrow = n, ncol = p, byrow = TRUE)
-  } else {
-    fitted <- tcrossprod(fl$L_pm, fl$F_pm)
-  }
-  Ximp <- X
-  na_idx <- is.na(X)
-  Ximp[na_idx] <- fitted[na_idx]
-  if (is.null(rownames(Ximp))) rownames(Ximp) <- rownames(X)
-  if (is.null(colnames(Ximp))) colnames(Ximp) <- colnames(X)
-  Ximp
-}
-
-#' Pipeline for molecular trait imputation
-#'
-#' Unified entry point for imputing missing values in a molecular trait
-#' matrix (e.g. expression, splicing, protein abundance). Dispatches to
-#' one or more underlying methods and optionally evaluates imputation
-#' quality on a held-out random mask of observed entries.
-#'
-#' Available methods:
-#' \itemize{
-#'   \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative
-#'     XGBoost per-column models. No structural assumption on the matrix.
-#'   \item \code{"softimpute"} - \code{\link{softimpute_imputation}},
-#'     soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a
-#'     low-rank smooth signal.
-#'   \item \code{"flashier"} - \code{\link{flashier_imputation}},
-#'     empirical Bayes matrix factorisation via \code{flashier::flash}
-#'     with sparse + scale-mixture priors.
-#' }
-#'
-#' @param data Numeric matrix (samples x features) with missing values.
-#' @param methods Character vector of methods to fit. Any subset of
-#'   \code{c("xgboost", "softimpute", "flashier")}. Default fits all
-#'   three.
-#' @param method_args Named list of per-method argument lists. List
-#'   names must match \code{methods}. Default empty.
-#' @param evaluate Logical. If TRUE, mask a random fraction of observed
-#'   entries, run each method, and report root-mean-squared error on the
-#'   masked cells. Default FALSE.
-#' @param eval_fraction Fraction of observed cells to hold out for
-#'   evaluation. Default 0.05.
-#' @param eval_seed Random seed for the held-out mask. Default 42.
-#' @param verbose Logical, print progress. Default FALSE.
-#'
-#' @return A list with:
-#' \describe{
-#'   \item{imputed}{Named list of imputed matrices, one per requested
-#'     method.}
-#'   \item{missingness}{Per-column NA count and overall NA fraction.}
-#'   \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with
-#'     one row per method: \code{method}, \code{rmse}, \code{n_held_out},
-#'     \code{time_seconds}.}
-#'   \item{method_args}{The resolved per-method arguments used.}
-#' }
-#' @export
-molecular_trait_imputation_pipeline <- function(
-    data,
-    methods = c("xgboost", "softimpute", "flashier"),
-    method_args = list(),
-    evaluate = FALSE,
-    eval_fraction = 0.05,
-    eval_seed = 42L,
-    verbose = FALSE) {
-  valid <- c("xgboost", "softimpute", "flashier")
-  if (!is.character(methods) || length(methods) == 0L) {
-    stop("methods must be a non-empty character vector.")
-  }
-  bad <- setdiff(methods, valid)
-  if (length(bad) > 0) {
-    stop("Unknown imputation method(s): ", paste(bad, collapse = ", "),
-         ". Valid options: ", paste(valid, collapse = ", "))
-  }
-  methods <- unique(methods)
-
-  Xfull <- as.matrix(data)
-  n <- nrow(Xfull); p <- ncol(Xfull)
-  na_mask_full <- is.na(Xfull)
-  na_per_col <- colSums(na_mask_full)
-  missingness <- list(
-    per_column = na_per_col,
-    n_missing = sum(na_mask_full),
-    fraction = sum(na_mask_full) / (n * p)
-  )
-  if (verbose) {
-    message(sprintf("molecular_trait_imputation_pipeline: %d x %d matrix, %.2f%% missing.",
-                    n, p, 100 * missingness$fraction))
-  }
-
-  dispatch <- list(xgboost = xgboost_imputation,
-                   softimpute = softimpute_imputation,
-                   flashier = flashier_imputation)
-
-  run_one <- function(m, X_in) {
-    args <- method_args[[m]] %||% list()
-    do.call(dispatch[[m]], c(list(X_in), args))
-  }
-
-  # If evaluating, hold out a random subset of observed cells, run each
-  # method against the masked matrix, then score RMSE on the held-out
-  # cells. Final imputation is fit against the original matrix so the
-  # returned matrices use every observed cell.
-  eval_df <- NULL
-  if (isTRUE(evaluate)) {
-    obs_idx <- which(!na_mask_full)
-    n_hold <- max(1L, floor(eval_fraction * length(obs_idx)))
-    if (n_hold >= length(obs_idx)) {
-      stop("eval_fraction too large; would mask all observed entries.")
-    }
-    set.seed(eval_seed)
-    hold_idx <- sample(obs_idx, size = n_hold, replace = FALSE)
-    Xmasked <- Xfull
-    held_truth <- Xfull[hold_idx]
-    Xmasked[hold_idx] <- NA_real_
-    eval_records <- vector("list", length(methods))
-    for (i in seq_along(methods)) {
-      m <- methods[i]
-      if (verbose) message("Evaluating ", m, " on held-out mask ...")
-      t0 <- proc.time()
-      Ximp_m <- run_one(m, Xmasked)
-      t1 <- proc.time()
-      pred <- Ximp_m[hold_idx]
-      rmse <- sqrt(mean((pred - held_truth)^2, na.rm = TRUE))
-      eval_records[[i]] <- data.frame(
-        method = m, rmse = rmse, n_held_out = n_hold,
-        time_seconds = as.numeric((t1 - t0)["elapsed"]),
-        stringsAsFactors = FALSE
-      )
-    }
-    eval_df <- do.call(rbind, eval_records)
-    rownames(eval_df) <- NULL
-  }
-
-  # Final imputation: run each method on the full matrix
-  imputed <- list()
-  for (m in methods) {
-    if (verbose) message("Imputing with ", m, " ...")
-    imputed[[m]] <- run_one(m, Xfull)
-  }
-
-  list(
-    imputed = imputed,
-    missingness = missingness,
-    evaluation = eval_df,
-    method_args = method_args
-  )
-}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index efcf0b98..e8bdeade 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -45,12 +45,6 @@ navbar:
    aria-label: View source on GitHub
 
 articles:
-  - title: "Data Preparation"
-    desc: >
-      Imputing missing values in molecular trait matrices before
-      fine-mapping and TWAS weight training.
-    contents:
-    - molecular-imputation
   - title: "Quality Control"
     desc: >
       Allele alignment and LD-mismatch QC for GWAS and QTL summary
diff --git a/man/flashier_imputation.Rd b/man/flashier_imputation.Rd
deleted file mode 100644
index 7762c785..00000000
--- a/man/flashier_imputation.Rd
+++ /dev/null
@@ -1,48 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/imputation.R
-\name{flashier_imputation}
-\alias{flashier_imputation}
-\title{flashier-based matrix imputation}
-\usage{
-flashier_imputation(
-  data,
-  var_type = 2,
-  ebnm_fn = c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture),
-  greedy_Kmax = NULL,
-  backfit = TRUE,
-  verbose = 0,
-  ...
-)
-}
-\arguments{
-\item{data}{Numeric matrix (samples x features) with missing values
-(NA).}
-
-\item{var_type}{Variance structure for the residuals, passed to
-\code{flashier::flash}. Default 2 (per-column).}
-
-\item{ebnm_fn}{Prior family. Default
-\code{c(ebnm::ebnm_normal, ebnm::ebnm_normal_scale_mixture)}.}
-
-\item{greedy_Kmax}{Maximum number of greedy factors. Default
-\code{min(20L, min(dim(data)) - 1L)}.}
-
-\item{backfit}{Logical; run a final backfit pass. Default TRUE.}
-
-\item{verbose}{Numeric verbosity (0 silent, default).}
-
-\item{...}{Additional arguments forwarded to \code{flashier::flash}.}
-}
-\value{
-The imputed matrix with the same dimensions as the input
-  (minus any all-NA columns).
-}
-\description{
-Imputes missing values in a numeric matrix via empirical Bayes
-matrix factorisation (\code{flashier::flash}). \code{flash} natively
-supports NA entries and reconstructs them from the posterior fitted
-values. The default prior families match those used by
-\code{\link{compute_cov_flash}}: \code{ebnm::ebnm_normal} on the row
-factor and \code{ebnm::ebnm_normal_scale_mixture} on the column
-factor.
-}
diff --git a/man/molecular_trait_imputation_pipeline.Rd b/man/molecular_trait_imputation_pipeline.Rd
deleted file mode 100644
index 7932803b..00000000
--- a/man/molecular_trait_imputation_pipeline.Rd
+++ /dev/null
@@ -1,68 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/imputation.R
-\name{molecular_trait_imputation_pipeline}
-\alias{molecular_trait_imputation_pipeline}
-\title{Pipeline for molecular trait imputation}
-\usage{
-molecular_trait_imputation_pipeline(
-  data,
-  methods = c("xgboost", "softimpute", "flashier"),
-  method_args = list(),
-  evaluate = FALSE,
-  eval_fraction = 0.05,
-  eval_seed = 42L,
-  verbose = FALSE
-)
-}
-\arguments{
-\item{data}{Numeric matrix (samples x features) with missing values.}
-
-\item{methods}{Character vector of methods to fit. Any subset of
-\code{c("xgboost", "softimpute", "flashier")}. Default fits all
-three.}
-
-\item{method_args}{Named list of per-method argument lists. List
-names must match \code{methods}. Default empty.}
-
-\item{evaluate}{Logical. If TRUE, mask a random fraction of observed
-entries, run each method, and report root-mean-squared error on the
-masked cells. Default FALSE.}
-
-\item{eval_fraction}{Fraction of observed cells to hold out for
-evaluation. Default 0.05.}
-
-\item{eval_seed}{Random seed for the held-out mask. Default 42.}
-
-\item{verbose}{Logical, print progress. Default FALSE.}
-}
-\value{
-A list with:
-\describe{
-  \item{imputed}{Named list of imputed matrices, one per requested
-    method.}
-  \item{missingness}{Per-column NA count and overall NA fraction.}
-  \item{evaluation}{When \code{evaluate = TRUE}, a data.frame with
-    one row per method: \code{method}, \code{rmse}, \code{n_held_out},
-    \code{time_seconds}.}
-  \item{method_args}{The resolved per-method arguments used.}
-}
-}
-\description{
-Unified entry point for imputing missing values in a molecular trait
-matrix (e.g. expression, splicing, protein abundance). Dispatches to
-one or more underlying methods and optionally evaluates imputation
-quality on a held-out random mask of observed entries.
-}
-\details{
-Available methods:
-\itemize{
-  \item \code{"xgboost"} - \code{\link{xgboost_imputation}}, iterative
-    XGBoost per-column models. No structural assumption on the matrix.
-  \item \code{"softimpute"} - \code{\link{softimpute_imputation}},
-    soft-thresholded SVD via \code{softImpute::softImpute}. Assumes a
-    low-rank smooth signal.
-  \item \code{"flashier"} - \code{\link{flashier_imputation}},
-    empirical Bayes matrix factorisation via \code{flashier::flash}
-    with sparse + scale-mixture priors.
-}
-}
diff --git a/man/softimpute_imputation.Rd b/man/softimpute_imputation.Rd
deleted file mode 100644
index 89eadff7..00000000
--- a/man/softimpute_imputation.Rd
+++ /dev/null
@@ -1,67 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/imputation.R
-\name{softimpute_imputation}
-\alias{softimpute_imputation}
-\title{softImpute-based matrix imputation}
-\usage{
-softimpute_imputation(
-  data,
-  rank.max = NULL,
-  lambda = NULL,
-  tune_fraction = 0.05,
-  tune_seed = 42L,
-  type = c("als", "svd"),
-  thresh = 1e-05,
-  maxit = 100,
-  verbose = FALSE
-)
-}
-\arguments{
-\item{data}{Numeric matrix (samples x features) with missing values
-(NA).}
-
-\item{rank.max}{Maximum rank of the SVD solution. Default
-\code{min(dim(data) - 1)}.}
-
-\item{lambda}{Nuclear-norm penalty. \code{NULL} (default) tunes over
-a default geometric grid via held-out CV. A scalar uses that lambda
-directly with no tuning. A vector defines the tuning grid.}
-
-\item{tune_fraction}{Fraction of observed cells held out for CV.
-Default 0.05. Ignored when \code{lambda} is a scalar.}
-
-\item{tune_seed}{Seed for the held-out mask. Default 42L.}
-
-\item{type}{Optimisation type passed to \code{softImpute::softImpute};
-one of \code{"als"} (default, faster) or \code{"svd"}.}
-
-\item{thresh}{Convergence threshold. Default 1e-5.}
-
-\item{maxit}{Maximum iterations. Default 100.}
-
-\item{verbose}{Logical, print progress (including per-lambda CV RMSE).
-Default FALSE.}
-}
-\value{
-The imputed matrix with the same dimensions as the input
-  (minus any all-NA columns). Carries attributes
-  \code{"softimpute_lambda"} (the lambda used for the final fit) and,
-  when tuning was performed, \code{"softimpute_lambda_cv"} (a
-  data.frame of grid-search RMSE values).
-}
-\description{
-Imputes missing values in a numeric matrix via the soft-thresholded SVD
-algorithm of \code{softImpute::softImpute}. Uses the \code{type = "als"}
-alternating-least-squares solver by default.
-}
-\details{
-\code{lambda} drives the bias-variance trade-off. By default the
-function tunes it via held-out cross-validation: a random fraction of
-observed cells is masked, softImpute is fit across a geometric grid of
-lambdas with warm-start, and the lambda minimising RMSE on the masked
-cells is selected. The final imputation then uses that lambda on the
-full input. Pass a scalar \code{lambda} to skip tuning.
-
-Columns with all entries missing are removed before fitting and a
-one-line message records the count.
-}
diff --git a/man/xgboost_imputation.Rd b/man/xgboost_imputation.Rd
deleted file mode 100644
index b09309ee..00000000
--- a/man/xgboost_imputation.Rd
+++ /dev/null
@@ -1,43 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/imputation.R
-\name{xgboost_imputation}
-\alias{xgboost_imputation}
-\title{XGBoost-based iterative imputation of missing values}
-\usage{
-xgboost_imputation(
-  data,
-  maxiter = 10L,
-  max_depth = 2L,
-  nrounds = 50L,
-  decreasing = FALSE,
-  num_workers = 1L,
-  verbose = TRUE
-)
-}
-\arguments{
-\item{data}{Numeric matrix with missing values (NA).}
-
-\item{maxiter}{Maximum number of imputation iterations (default 10).}
-
-\item{max_depth}{Maximum tree depth for XGBoost (default 2).}
-
-\item{nrounds}{Number of boosting rounds per variable (default 50).}
-
-\item{decreasing}{Logical. If TRUE, impute variables with most missing
-values first. Default FALSE (fewest missing first).}
-
-\item{num_workers}{Number of parallel workers for BiocParallel. Default 1
-(sequential).}
-
-\item{verbose}{Logical, print progress (default TRUE).}
-}
-\value{
-The imputed matrix with the same dimensions as the input (minus
-  any all-NA columns).
-}
-\description{
-Imputes missing values in a numeric matrix by iteratively training
-per-column XGBoost models on observed entries and predicting missing ones.
-Columns that are entirely missing are removed. Initial imputation uses
-column means.
-}
diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R
index 7dfb1068..013db71d 100644
--- a/tests/testthat/test_misc.R
+++ b/tests/testthat/test_misc.R
@@ -1684,50 +1684,6 @@ test_that("twas_method_cor with diagonal LD", {
   expect_equal(result[1, 2], 0)
 })
 
-# =============================================================================
-# xgboost_imputation
-# =============================================================================
-
-test_that("xgboost_imputation works on simple matrix", {
-  skip_if_not_installed("xgboost")
-  set.seed(42)
-  mat <- matrix(rnorm(100), nrow = 20, ncol = 5)
-  colnames(mat) <- paste0("V", 1:5)
-  # Introduce some missing values
-  mat[1, 1] <- NA
-  mat[5, 3] <- NA
-  mat[10, 2] <- NA
-  result <- xgboost_imputation(mat, maxiter = 2, nrounds = 10, verbose = FALSE)
-  expect_equal(dim(result), dim(mat))
-  expect_false(anyNA(result))
-})
-
-test_that("xgboost_imputation removes all-NA columns", {
-  skip_if_not_installed("xgboost")
-  set.seed(42)
-  mat <- matrix(rnorm(80), nrow = 20, ncol = 4)
-  colnames(mat) <- paste0("V", 1:4)
-  mat[, 3] <- NA  # entirely missing column
-  mat[1, 1] <- NA
-  result <- expect_message(
-    xgboost_imputation(mat, maxiter = 2, nrounds = 10, verbose = TRUE),
-    "Removed 1 column"
-  )
-  expect_equal(ncol(result), 3)
-  expect_false(anyNA(result))
-})
-
-test_that("xgboost_imputation with no missing returns data unchanged", {
-  skip_if_not_installed("xgboost")
-  set.seed(42)
-  mat <- matrix(rnorm(60), nrow = 10, ncol = 6)
-  result <- expect_message(
-    xgboost_imputation(mat, maxiter = 2, verbose = TRUE),
-    "No missing values"
-  )
-  expect_equal(result, mat)
-})
-
 # =============================================================================
 # compute_qvalues — uncovered branches
 # =============================================================================
diff --git a/tests/testthat/test_molecular_imputation.R b/tests/testthat/test_molecular_imputation.R
deleted file mode 100644
index 54d39a4a..00000000
--- a/tests/testthat/test_molecular_imputation.R
+++ /dev/null
@@ -1,185 +0,0 @@
-context("Molecular trait imputation pipeline")
-
-# Helper: low-rank synthetic matrix with NA mask.
-.make_imputation_inputs <- function(n = 60, p = 15, rank = 3, na_frac = 0.1,
-                                     seed = 1) {
-  set.seed(seed)
-  U <- matrix(rnorm(n * rank), n, rank)
-  V <- matrix(rnorm(p * rank), p, rank)
-  truth <- U %*% t(V) + matrix(rnorm(n * p, sd = 0.1), n, p)
-  X <- truth
-  na_mask <- matrix(runif(n * p) < na_frac, n, p)
-  X[na_mask] <- NA
-  rownames(X) <- paste0("S", seq_len(n))
-  colnames(X) <- paste0("F", seq_len(p))
-  list(X = X, truth = truth, na_mask = na_mask)
-}
-
-# =============================================================================
-# softimpute_imputation
-# =============================================================================
-
-test_that("softimpute_imputation: returns matrix with same dims, no NAs", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  Xi <- softimpute_imputation(inp$X)
-  expect_equal(dim(Xi), dim(inp$X))
-  expect_false(any(is.na(Xi)))
-  expect_equal(rownames(Xi), rownames(inp$X))
-})
-
-test_that("softimpute_imputation: handles all-NA columns by removing them", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  X <- inp$X
-  X[, 5] <- NA
-  expect_message(
-    Xi <- softimpute_imputation(X, verbose = TRUE),
-    "Removed 1 column"
-  )
-  expect_equal(ncol(Xi), ncol(X) - 1L)
-})
-
-test_that("softimpute_imputation: default tunes lambda via CV and attaches grid", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  Xi <- softimpute_imputation(inp$X)  # default lambda = NULL -> tune
-  expect_false(is.null(attr(Xi, "softimpute_lambda")))
-  grid <- attr(Xi, "softimpute_lambda_cv")
-  expect_s3_class(grid, "data.frame")
-  expect_named(grid, c("lambda", "rmse"))
-  expect_true(all(diff(grid$lambda) <= 0))  # grid sorted descending
-  expect_true(any(is.finite(grid$rmse)))
-  # The chosen lambda must be one of the grid points
-  expect_true(attr(Xi, "softimpute_lambda") %in% grid$lambda)
-})
-
-test_that("softimpute_imputation: scalar lambda bypasses tuning", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  l0 <- softImpute::lambda0(inp$X)
-  Xi <- softimpute_imputation(inp$X, lambda = 0.3 * l0)
-  expect_equal(attr(Xi, "softimpute_lambda"), 0.3 * l0)
-  expect_null(attr(Xi, "softimpute_lambda_cv"))
-})
-
-test_that("softimpute_imputation: vector lambda restricts tuning to that grid", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  l0 <- softImpute::lambda0(inp$X)
-  custom_grid <- l0 * c(0.05, 0.1, 0.25, 0.5)
-  Xi <- softimpute_imputation(inp$X, lambda = custom_grid)
-  grid <- attr(Xi, "softimpute_lambda_cv")
-  expect_setequal(grid$lambda, custom_grid)
-  expect_true(attr(Xi, "softimpute_lambda") %in% custom_grid)
-})
-
-test_that("softimpute_imputation: chosen lambda minimises CV RMSE on the grid", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs(n = 80, p = 20, rank = 3, seed = 11)
-  Xi <- softimpute_imputation(inp$X)
-  grid <- attr(Xi, "softimpute_lambda_cv")
-  chosen <- attr(Xi, "softimpute_lambda")
-  ok <- which(is.finite(grid$rmse))
-  best_idx <- ok[which.min(grid$rmse[ok])]
-  expect_equal(chosen, grid$lambda[best_idx])
-})
-
-# =============================================================================
-# flashier_imputation
-# =============================================================================
-
-test_that("flashier_imputation: returns matrix with same dims, no NAs", {
-  skip_if_not_installed("flashier")
-  skip_if_not_installed("ebnm")
-  inp <- .make_imputation_inputs()
-  Xi <- flashier_imputation(inp$X, verbose = 0)
-  expect_equal(dim(Xi), dim(inp$X))
-  expect_false(any(is.na(Xi)))
-})
-
-test_that("flashier_imputation: recovers low-rank truth better than column means", {
-  skip_if_not_installed("flashier")
-  skip_if_not_installed("ebnm")
-  inp <- .make_imputation_inputs()
-  Xi <- flashier_imputation(inp$X, verbose = 0)
-  # Baseline: column-mean imputation
-  Xbase <- inp$X
-  cm <- colMeans(Xbase, na.rm = TRUE)
-  for (j in seq_len(ncol(Xbase))) Xbase[is.na(Xbase[, j]), j] <- cm[j]
-  rmse_flash <- sqrt(mean((Xi[inp$na_mask] - inp$truth[inp$na_mask])^2))
-  rmse_mean  <- sqrt(mean((Xbase[inp$na_mask] - inp$truth[inp$na_mask])^2))
-  expect_lt(rmse_flash, rmse_mean)
-})
-
-# =============================================================================
-# molecular_trait_imputation_pipeline
-# =============================================================================
-
-test_that("pipeline: single-method dispatch returns named list keyed by method", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute")
-  expect_named(res$imputed, "softimpute")
-  expect_equal(dim(res$imputed[["softimpute"]]), dim(inp$X))
-  expect_null(res$evaluation)
-})
-
-test_that("pipeline: multi-method runs all three and returns one matrix per method", {
-  skip_if_not_installed("softImpute")
-  skip_if_not_installed("flashier")
-  skip_if_not_installed("xgboost")
-  inp <- .make_imputation_inputs()
-  res <- molecular_trait_imputation_pipeline(
-    inp$X, methods = c("xgboost", "softimpute", "flashier"),
-    method_args = list(
-      xgboost = list(maxiter = 2, nrounds = 10, verbose = FALSE)
-    )
-  )
-  expect_named(res$imputed, c("xgboost", "softimpute", "flashier"))
-  for (m in names(res$imputed)) {
-    expect_equal(dim(res$imputed[[m]]), dim(inp$X))
-    expect_false(any(is.na(res$imputed[[m]])))
-  }
-})
-
-test_that("pipeline: evaluate = TRUE reports RMSE on held-out cells", {
-  skip_if_not_installed("softImpute")
-  skip_if_not_installed("flashier")
-  inp <- .make_imputation_inputs()
-  res <- molecular_trait_imputation_pipeline(
-    inp$X, methods = c("softimpute", "flashier"),
-    evaluate = TRUE, eval_fraction = 0.1, eval_seed = 7
-  )
-  expect_s3_class(res$evaluation, "data.frame")
-  expect_equal(nrow(res$evaluation), 2L)
-  expect_named(res$evaluation,
-               c("method", "rmse", "n_held_out", "time_seconds"))
-  expect_true(all(res$evaluation$rmse > 0))
-  expect_true(all(res$evaluation$n_held_out > 0))
-})
-
-test_that("pipeline: missingness summary matches input NA count", {
-  skip_if_not_installed("softImpute")
-  inp <- .make_imputation_inputs()
-  res <- molecular_trait_imputation_pipeline(inp$X, methods = "softimpute")
-  expect_equal(res$missingness$n_missing, sum(is.na(inp$X)))
-  expect_equal(res$missingness$fraction,
-               sum(is.na(inp$X)) / prod(dim(inp$X)))
-})
-
-test_that("pipeline: rejects unknown method", {
-  inp <- .make_imputation_inputs()
-  expect_error(
-    molecular_trait_imputation_pipeline(inp$X, methods = "knnimpute"),
-    "Unknown imputation method"
-  )
-})
-
-test_that("pipeline: rejects empty methods vector", {
-  inp <- .make_imputation_inputs()
-  expect_error(
-    molecular_trait_imputation_pipeline(inp$X, methods = character(0)),
-    "non-empty character vector"
-  )
-})

From 032f5cc553d10fcbbef745b290bfc9edba65bfb6 Mon Sep 17 00:00:00 2001
From: danielnachun <danielnachun@users.noreply.github.com>
Date: Sat, 6 Jun 2026 02:11:40 +0000
Subject: [PATCH 6/6] Update documentation

---
 man/load_multitask_regional_data.Rd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/load_multitask_regional_data.Rd b/man/load_multitask_regional_data.Rd
index 57b231ab..8f74cb72 100644
--- a/man/load_multitask_regional_data.Rd
+++ b/man/load_multitask_regional_data.Rd
@@ -129,10 +129,10 @@ This function loads a mixture data sets for a specific region, including individ
 or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets
 }
 \section{Loading individual level data from multiple corhorts}{
-
+NA
 }
 
 \section{Loading summary statistics from multiple corhorts or data set}{
-
+NA
 }