Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 67 additions & 13 deletions R/file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -1317,18 +1317,15 @@ standardise_sumstats_columns <- function(sumstats, column_file_path = NULL, comm
)
# Make a copy to avoid in-place modification by MungeSumstats
sumstats_copy <- data.frame(sumstats, check.names = FALSE)
# Use MungeSumstats for comprehensive column standardization
sumstats_copy <- standardise_header(
sumstats_copy, return_list = FALSE, uppercase_unmapped = FALSE
)
# Rename MungeSumstats standard names to pecotmr conventions
for (ms_name in names(ms_to_pecotmr)) {
idx <- which(colnames(sumstats_copy) == ms_name)
if (length(idx) > 0) {
colnames(sumstats_copy)[idx] <- ms_to_pecotmr[ms_name]
}
}
# Apply additional custom column mapping if provided

# Read the explicit user column mapping first. User declarations are
# AUTHORITATIVE: a column the user mapped (e.g. `af:effect_allele_frequency`)
# must not be silently overridden by MungeSumstats (which would otherwise
# absorb `effect_allele_frequency` into `FRQ` -> `maf` before the custom map
# could run). We therefore shield each declared source column behind a unique
# placeholder, let MungeSumstats standardize everything else, then restore the
# declared columns to their requested standard names last.
placeholders <- character(0)
if (!is.null(column_file_path)) {
if (!file.exists(column_file_path)) {
stop("Column mapping file not found: ", column_file_path)
Expand All @@ -1342,10 +1339,32 @@ standardise_sumstats_columns <- function(sumstats, column_file_path = NULL, comm
for (i in seq_len(nrow(column_data))) {
idx <- which(colnames(sumstats_copy) == column_data$original[i])
if (length(idx) > 0) {
colnames(sumstats_copy)[idx] <- column_data$standard[i]
ph <- paste0(".pecotmr_decl_", i)
colnames(sumstats_copy)[idx] <- ph
placeholders[[ph]] <- column_data$standard[i]
}
}
}

# Use MungeSumstats for comprehensive column standardization (shielded
# declared columns pass through untouched as unmapped placeholders).
sumstats_copy <- standardise_header(
sumstats_copy, return_list = FALSE, uppercase_unmapped = FALSE
)
# Rename MungeSumstats standard names to pecotmr conventions
for (ms_name in names(ms_to_pecotmr)) {
idx <- which(colnames(sumstats_copy) == ms_name)
if (length(idx) > 0) {
colnames(sumstats_copy)[idx] <- ms_to_pecotmr[ms_name]
}
}
# Restore user-declared columns to their requested standard names (last word).
for (ph in names(placeholders)) {
idx <- which(colnames(sumstats_copy) == ph)
if (length(idx) > 0) {
colnames(sumstats_copy)[idx] <- placeholders[[ph]]
}
}
as.data.frame(sumstats_copy)
}

Expand Down Expand Up @@ -1419,6 +1438,41 @@ load_rss_data <- function(sumstat_path, column_file_path = NULL, n_sample = 0, n

# Standardize column names via MungeSumstats + optional custom mapping
sumstats <- standardise_sumstats_columns(sumstats, column_file_path, comment_string)

# ---- Effect-allele frequency (af) propagation -------------------------------
# `af` is the frequency of the effect allele / A1 and is exported (after
# harmonization) as top_loci$af. It becomes available ONLY through an explicit
# column-file mapping to the standard name `af` (MungeSumstats never emits
# `af`; it maps FRQ -> `maf`, which stays an internal QC quantity). Ambiguous
# frequency headers therefore never silently become `af`. The effect allele
# must also be resolvable (an A1 column or an allele-bearing variant id), or
# the declared frequency cannot be tied to a direction and is not exported.
af_declared <- "af" %in% colnames(sumstats)
has_effect_allele <- "A1" %in% colnames(sumstats) ||
any(c("variant_id", "variant") %in% colnames(sumstats))
if (af_declared && has_effect_allele) {
sumstats$af <- suppressWarnings(as.numeric(sumstats$af))
if (all(is.na(sumstats$af))) {
warning("Effect-allele frequency column 'af' was declared but its values ",
"are missing/unusable; top_loci$af will be NA and MAF filtering ",
"will be skipped.")
}
} else {
if (af_declared && !has_effect_allele) {
warning("Effect-allele frequency 'af' was declared but no effect allele ",
"(A1 / allele-bearing variant id) is available to tie it to a ",
"direction; it will not be exported. top_loci$af will be NA and ",
"MAF filtering will be skipped.")
} else {
warning("Effect-allele frequency (af) was not declared in the column ",
"file; top_loci$af will be NA and MAF filtering will be skipped. ",
"Generic frequency headers (FRQ/AF/allele_frequency) are kept as ",
"internal MAF only and are never exported as af.")
}
sumstats$af <- NA_real_
}
# ----------------------------------------------------------------------------

has_observed_beta_se <- all(c("beta", "se") %in% colnames(sumstats))
if (binary_trait_model == "ols" && !has_observed_beta_se) {
stop("binary_trait_model = 'ols' requires observed beta and se columns ",
Expand Down
14 changes: 14 additions & 0 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ compute_maf <- function(geno) {
return(min(f, 1 - f))
}

#' Derive minor-allele frequency from effect-allele frequency
#'
#' MAF is an internal QC/filtering quantity only; it is never exported. Use this
#' helper wherever a MAF is needed from a (directional) effect-allele frequency
#' \code{af}, instead of carrying a separate \code{maf} column. NA in -> NA out.
#'
#' @param af Numeric vector of effect-allele frequencies in \code{[0, 1]}.
#' @return Numeric vector \code{pmin(af, 1 - af)}, preserving NA.
#' @noRd
maf_from_af <- function(af) {
af <- as.numeric(af)
pmin(af, 1 - af)
}

compute_missing <- function(geno) {
miss <- sum(is.na(geno)) / length(geno)
return(miss)
Expand Down
31 changes: 21 additions & 10 deletions R/susie_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,8 @@ fit_susie_inf_then_susie_rss <- function(z, R, n, args = list(),
#' @param data_y Phenotype vector/matrix or summary statistics. Default NULL.
#' @param X_scalar Scaling factor for genotype effects. Default 1.
#' @param y_scalar Scaling factor for phenotype effects. Default 1.
#' @param maf Minor allele frequencies. Default NULL.
#' @param af Effect-allele frequencies (exported as the \code{af} column; never
#' MAF). Default NULL.
#' @param coverage Primary credible-set coverage.
#' @param secondary_coverage Additional credible-set coverages.
#' @param signal_cutoff PIP cutoff for including non-CS variants in top loci.
Expand All @@ -230,7 +231,7 @@ fit_susie_inf_then_susie_rss <- function(z, R, n, args = list(),
#' @export
postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL,
X_scalar = 1, y_scalar = 1,
maf = NULL, coverage = NULL,
af = NULL, coverage = NULL,
secondary_coverage = c(0.7, 0.5),
signal_cutoff = 0.1,
other_quantities = NULL,
Expand All @@ -251,7 +252,7 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL,
fit <- .set_finemapping_fit_class(fits[[method]], method)
postprocess_finemapping_fit(
fit, method = method, data_x = data_x, data_y = data_y,
X_scalar = X_scalar, y_scalar = y_scalar, maf = maf,
X_scalar = X_scalar, y_scalar = y_scalar, af = af,
coverage = coverage, secondary_coverage = secondary_coverage,
signal_cutoff = signal_cutoff, other_quantities = other_quantities,
region = region,
Expand Down Expand Up @@ -316,7 +317,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", cs_input =

.postprocess_finemapping_fit_common <- function(fit, method, data_x, data_y = NULL,
X_scalar = 1, y_scalar = 1,
maf = NULL, coverage = NULL,
af = NULL, coverage = NULL,
secondary_coverage = c(0.7, 0.5),
signal_cutoff = 0.1,
other_quantities = NULL,
Expand All @@ -335,7 +336,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", cs_input =
)
top_loci <- build_top_loci(
fit, cs_tables, variant_names = variant_names, sumstats = sumstats,
maf = maf, method = method, signal_cutoff = signal_cutoff,
af = af, method = method, signal_cutoff = signal_cutoff,
data_x = data_x, data_y = data_y, other_quantities = other_quantities,
region = region
)
Expand Down Expand Up @@ -489,7 +490,7 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", "
#' single \code{top_loci} returned by \code{format_finemapping_output()}.
#'
#' Output columns, in order: \code{#chr}, \code{start}, \code{end}, \code{a1},
#' \code{a2}, \code{variant}, \code{gene}, \code{event}, \code{n}, \code{maf},
#' \code{a2}, \code{variant}, \code{gene}, \code{event}, \code{n}, \code{af},
#' \code{beta}, \code{se}, \code{pip}, \code{posterior_effect_mean},
#' \code{posterior_effect_se}, \code{cs_95}, \code{cs_70}, \code{cs_50},
#' \code{cs_95_purity}, \code{method}, \code{grange_start}, \code{grange_end}.
Expand All @@ -513,7 +514,11 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", "
#' (\code{chr:pos:A2:A1}).
#' @param sumstats Optional marginal-association summary (\code{betahat},
#' \code{sebetahat}) filling \code{beta} / \code{se}.
#' @param maf Optional numeric vector of minor-allele frequencies.
#' @param af Optional numeric vector of effect-allele frequencies (frequency of
#' the final effect allele / \code{a1} after allele harmonization against the
#' LD/reference variants). Exported directly as the \code{af} column. MAF is
#' never exported; derive it from \code{af} at filter time. Default NULL ->
#' \code{af = NA_real_}.
#' @param method Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Required.
#' @param signal_cutoff PIP cutoff for retaining PIP-only (non-CS) variants.
#' @param data_x Optional regional genotype matrix.
Expand All @@ -525,7 +530,7 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", "
#' or an empty data frame if nothing is retained.
#' @export
build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL,
maf = NULL, method, signal_cutoff = 0.1,
af = NULL, method, signal_cutoff = 0.1,
data_x = NULL, data_y = NULL,
other_quantities = NULL,
region = NULL) {
Expand Down Expand Up @@ -631,7 +636,7 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL,
gene = rep(fit_gene, n_keys),
event = rep(fit_event, n_keys),
n = rep(fit_n, n_keys),
maf = pick(maf),
af = pick(af),
beta = pick(sumstats$betahat),
se = pick(sumstats$sebetahat),
pip = as.numeric(fit$pip[v_idx]),
Expand Down Expand Up @@ -679,7 +684,7 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL,
gene = character(),
event = character(),
n = integer(),
maf = numeric(),
af = numeric(),
beta = numeric(),
se = numeric(),
pip = numeric(),
Expand Down Expand Up @@ -1094,10 +1099,16 @@ susie_rss_pipeline <- function(sumstats, LD_mat = NULL, X_mat = NULL, n = NULL,
pp_cs_input <- "X"
}

# Effect-allele frequency for top_loci$af. Carried only when the harmonized
# sumstats declares it (effect-allele AF); aligned to the z / variant order.
# MAF is never exported here; it is an internal QC quantity derived from af.
af <- if (!is.null(sumstats$af)) as.numeric(sumstats$af) else NULL

post <- postprocess_finemapping_fits(
fits = fitted_models,
data_x = data_x,
data_y = list(z = z),
af = af,
coverage = coverage,
secondary_coverage = secondary_coverage,
signal_cutoff = signal_cutoff,
Expand Down
19 changes: 17 additions & 2 deletions R/univariate_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
#' @param Y A vector of phenotype measurements.
#' @param X_scalar A scalar or vector to rescale X to its original scale.
#' @param Y_scalar A scalar to rescale Y to its original scale.
#' @param maf A vector of minor allele frequencies for each variant in X.
#' @param maf A vector of minor allele frequencies for each variant in X. Used
#' only for \code{maf_cutoff} filtering; never exported.
#' @param af Optional vector of directional effect-allele frequencies (frequency
#' of \code{a1}) aligned to the columns of X. When supplied it is exported as
#' the \code{top_loci$af} column; when NULL, \code{af} is \code{NA_real_}.
#' Default NULL.
#' @param X_variance Optional variance of X. Default is NULL.
#' @param other_quantities A list of other quantities to be carried into fine-mapping post-processing. Default is an empty list.
#' @param region Optional \code{"chr:start-end"} string for the analysis region. Default is NULL.
Expand Down Expand Up @@ -60,6 +65,7 @@ univariate_analysis_pipeline <- function(
X,
Y,
maf,
af = NULL,
X_scalar = 1,
Y_scalar = 1,
X_variance = NULL,
Expand Down Expand Up @@ -95,6 +101,13 @@ univariate_analysis_pipeline <- function(
if (nrow(X) != length(Y)) stop("X and Y must have the same number of rows/length")
if (!is.numeric(maf) || length(maf) != ncol(X)) stop("maf must be a numeric vector with length equal to the number of columns in X")
if (any(maf < 0 | maf > 1)) stop("maf values must be between 0 and 1")
# af (directional effect-allele frequency) is optional. When supplied it must
# align with X columns; it is exported as top_loci$af. maf stays directionless
# and is used only for maf_cutoff filtering.
if (!is.null(af)) {
if (!is.numeric(af) || length(af) != ncol(X)) stop("af must be NULL or a numeric vector with length equal to the number of columns in X")
if (any(af < 0 | af > 1, na.rm = TRUE)) stop("af values must be between 0 and 1")
}
if (!is.numeric(X_scalar) || (length(X_scalar) != 1 && length(X_scalar) != ncol(X))) stop("X_scalar must be a numeric scalar or vector with length equal to the number of columns in X")
if (!is.numeric(Y_scalar) || length(Y_scalar) != 1) stop("Y_scalar must be a numeric scalar")
if (!is.numeric(L) || L <= 0) stop("L must be a positive integer")
Expand Down Expand Up @@ -153,6 +166,7 @@ univariate_analysis_pipeline <- function(
variants_kept <- filter_variants_by_ld_reference(colnames(X), ld_reference_meta_file)
X <- X[, variants_kept$data, drop = FALSE]
maf <- maf[variants_kept$idx]
if (!is.null(af)) af <- af[variants_kept$idx]
if (length(X_scalar) > 1) X_scalar <- X_scalar[variants_kept$idx]
}

Expand All @@ -161,6 +175,7 @@ univariate_analysis_pipeline <- function(
X_filtered <- filter_X(X, imiss_cutoff, maf_cutoff, var_thresh = xvar_cutoff, maf = maf, X_variance = X_variance)
kept_indices <- match(colnames(X_filtered), colnames(X))
maf <- maf[kept_indices]
if (!is.null(af)) af <- af[kept_indices]
if (length(X_scalar) > 1) X_scalar <- X_scalar[kept_indices]
X <- X_filtered
}
Expand Down Expand Up @@ -241,7 +256,7 @@ univariate_analysis_pipeline <- function(
data_y = Y,
X_scalar = X_scalar,
y_scalar = Y_scalar,
maf = maf,
af = af,
coverage = coverage[1],
secondary_coverage = if (length(coverage) > 1) coverage[-1] else NULL,
signal_cutoff = signal_cutoff,
Expand Down
10 changes: 7 additions & 3 deletions man/build_top_loci.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions man/postprocess_finemapping_fits.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion man/univariate_analysis_pipeline.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading