From c6813069eb5653eb7a7e25fb1b05f091e5e3dae1 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Wed, 27 May 2026 09:28:42 -0500 Subject: [PATCH 1/2] fix bug in loading LD sketch --- R/genotype_io.R | 11 +++++++---- tests/testthat/test_load_genotype.R | 9 +++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/R/genotype_io.R b/R/genotype_io.R index 8774385d..8fecfdc6 100644 --- a/R/genotype_io.R +++ b/R/genotype_io.R @@ -460,7 +460,7 @@ computeBlockLdCor <- function(handle, snp_idx, backend = "internal", ext <- tolower(file_ext(path)) if (nzchar(ext)) { - return(switch(ext, + detected <- switch(ext, "vcf" = "vcf", "bcf" = "vcf", "bed" = "plink1", @@ -475,16 +475,19 @@ computeBlockLdCor <- function(handle, snp_idx, backend = "internal", "annot" = "ldsc_annot", "bw" = "bigwig", "bigwig" = "bigwig", - stop("Cannot detect format from extension: ", ext) - )) + NULL + ) + if (!is.null(detected)) return(detected) } - # No extension — check for plink stem files + # Check for file stems, including dotted prefixes such as sample.EUR.chr21. if (file.exists(paste0(path, ".pgen")) || file.exists(paste0(path, ".pvar"))) return("plink2") if (file.exists(paste0(path, ".bed")) || file.exists(paste0(path, ".bim"))) return("plink1") if (file.exists(paste0(path, ".gds"))) return("gds") + if (nzchar(ext)) + stop("Cannot detect format from extension: ", ext) stop("Cannot detect genotype format for path: ", path) } diff --git a/tests/testthat/test_load_genotype.R b/tests/testthat/test_load_genotype.R index 8ca761d7..60854574 100644 --- a/tests/testthat/test_load_genotype.R +++ b/tests/testthat/test_load_genotype.R @@ -13,6 +13,15 @@ region_sub <- "chr21:17513228-17550000" n_samples <- 100L n_variants <- 349L +test_that("format detection supports dotted PLINK2 prefixes", { + tmp <- tempfile("plink2_dotted_prefix_") + prefix <- file.path(dirname(tmp), "ADSP.R4.EUR.chr21") + file.create(paste0(prefix, ".pgen"), paste0(prefix, ".pvar"), paste0(prefix, ".psam")) + on.exit(unlink(paste0(prefix, c(".pgen", ".pvar", ".psam"))), add = TRUE) + + expect_equal(pecotmr:::.h2_detect_format(prefix), "plink2") +}) + # Shared helper: validate the output structure from load_genotype_region # (with return_variant_info=TRUE) check_genotype_result <- function(result, expected_nrow = n_samples, expected_ncol = n_variants, From 0bb698a0de2f465216b7bed41143aaf9bc71a726 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 28 May 2026 04:06:03 -0500 Subject: [PATCH 2/2] Fix ColocBoost S4 LDData handling ColocBoost adapters now accept genotype-backed LDData objects from load_LD_matrix(), preserving X_ref-based paths instead of treating LDData as a legacy list. Added regression coverage for direct RegionalData conversion, direct ColocBoost QC input prep, and summary-stat QC with S4 LDData. --- R/colocboost_pipeline.R | 5 +- R/sumstats_qc.R | 31 ++++++++- tests/testthat/test_colocboost_pipeline.R | 77 +++++++++++++++++++++++ tests/testthat/test_sumstats_qc.R | 57 +++++++++++++++++ 4 files changed, 167 insertions(+), 3 deletions(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index f79a8d3d..ea3a82f5 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -66,8 +66,9 @@ region_data_to_colocboost_input <- function(region_data) { ind_args <- .cb_format_individual(ind_records) sumstat_records <- lapply(names(rss_input$rss_input), function(study) { + ld_data <- .normalize_ld_data_for_qc(rss_input$LD_data[[study]]) list(rss_input = rss_input$rss_input[[study]], - LD_matrix = rss_input$LD_data[[study]]$LD_matrix) + LD_matrix = ld_data$LD_matrix) }) names(sumstat_records) <- names(rss_input$rss_input) sumstat_args <- .cb_format_sumstat(sumstat_records) @@ -1127,7 +1128,7 @@ qc_individual_data <- function(X, Y, maf = NULL, X_variance = NULL, LD_reference_info = NULL, variant_convention = c("A2_A1", "A1_A2")) { is_ld_data <- function(x) { - is.list(x) && !is.null(x$LD_matrix) + methods::is(x, "LDData") || (is.list(x) && !is.null(x$LD_matrix)) } as_reference_info_list <- function(x) { if (is.null(x)) return(NULL) diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index d0f13b6a..ca14e4fe 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -277,7 +277,8 @@ summary_stats_qc <- function(sumstats, LD_data, n = NULL, is.list(x) && is.data.frame(x$sumstats) } is_ld_record <- function(x) { - is.matrix(x) || (is.list(x) && !is.null(x$LD_matrix)) + methods::is(x, "LDData") || is.matrix(x) || + (is.list(x) && !is.null(x$LD_matrix)) } first_ld_record <- function(x, study_name = NULL) { if (is_ld_record(x)) return(x) @@ -371,6 +372,34 @@ summary_stats_qc <- function(sumstats, LD_data, n = NULL, .normalize_ld_data_for_qc <- function(LD_data) { if (is.null(LD_data)) return(NULL) + if (methods::is(LD_data, "LDData")) { + variant_info <- getVariantInfo(LD_data) + ref_panel <- as.data.frame(S4Vectors::mcols(variant_info)) + ref_panel$chrom <- as.character(GenomicRanges::seqnames(variant_info)) + ref_panel$pos <- GenomicRanges::start(variant_info) + if (!"variant_id" %in% colnames(ref_panel)) { + ref_panel$variant_id <- getVariantIds(LD_data) + } + + is_genotype <- hasGenotypes(LD_data) + LD_matrix <- if (is_genotype) getGenotypes(LD_data) else getCorrelation(LD_data) + LD_variants <- getVariantIds(LD_data) + if (is.matrix(LD_matrix)) { + if (is_genotype && length(LD_variants) == ncol(LD_matrix)) { + colnames(LD_matrix) <- LD_variants + } else if (!is_genotype && length(LD_variants) == nrow(LD_matrix)) { + rownames(LD_matrix) <- colnames(LD_matrix) <- LD_variants + } + } + + return(list( + LD_matrix = LD_matrix, + LD_variants = LD_variants, + ref_panel = ref_panel, + block_metadata = getBlockMetadata(LD_data), + is_genotype = is_genotype + )) + } if (is.matrix(LD_data)) { ids <- if (nrow(LD_data) == ncol(LD_data)) rownames(LD_data) else colnames(LD_data) LD_data <- list(LD_matrix = LD_data, LD_variants = ids) diff --git a/tests/testthat/test_colocboost_pipeline.R b/tests/testthat/test_colocboost_pipeline.R index e9214765..9cc09661 100644 --- a/tests/testthat/test_colocboost_pipeline.R +++ b/tests/testthat/test_colocboost_pipeline.R @@ -227,6 +227,83 @@ test_that("RegionalData adapters expose individual and RSS inputs", { expect_true(all(names(rss_input$rss_input) %in% names(rss_input$LD_data))) }) +test_that("ColocBoost adapters accept genotype-backed LDData", { + skip_if_not_installed("pgenlibr") + td <- test_path("test_data") + tmp <- tempfile("cb_lddata_") + dir.create(tmp) + on.exit(unlink(tmp, recursive = TRUE), add = TRUE) + + prefix <- "test_variants" + for (ext in c("pgen", "pvar", "psam", "afreq")) { + file.copy(file.path(td, paste0(prefix, ".", ext)), + file.path(tmp, paste0(prefix, ".", ext))) + } + meta_file <- file.path(tmp, "ld_meta.tsv") + writeLines(c("chrom\tstart\tend\tpath", "21\t0\t0\ttest_variants"), meta_file) + ld_data <- suppressWarnings(suppressMessages(load_LD_matrix( + meta_file, + region = "chr21:17513228-17550000", + return_genotype = TRUE + ))) + + variant_info <- getVariantInfo(ld_data) + ref_panel <- as.data.frame(S4Vectors::mcols(variant_info)) + ref_panel$chrom <- as.character(GenomicRanges::seqnames(variant_info)) + ref_panel$pos <- GenomicRanges::start(variant_info) + allele_pair <- apply(cbind(ref_panel$A1, ref_panel$A2), 1, function(x) { + paste(sort(x), collapse = "") + }) + ref_panel <- ref_panel[nchar(ref_panel$A1) == 1 & + nchar(ref_panel$A2) == 1 & + !allele_pair %in% c("AT", "CG"), , drop = FALSE] + ref_panel <- utils::head(ref_panel, 5) + variant_id <- format_variant_id(ref_panel$chrom, ref_panel$pos, + ref_panel$A2, ref_panel$A1) + rss_record <- list( + sumstats = data.frame( + chrom = ref_panel$chrom, + pos = ref_panel$pos, + A1 = ref_panel$A1, + A2 = ref_panel$A2, + z = seq_len(nrow(ref_panel)), + variant_id = variant_id, + stringsAsFactors = FALSE + ), + n = 1000, + var_y = 1 + ) + region_data <- list( + individual_data = NULL, + sumstat_data = list( + sumstats = list(ldgrp = list(study = rss_record)), + LD_info = list(ldgrp = ld_data) + ) + ) + + converted <- region_data_to_colocboost_input(region_data) + expect_null(converted$colocboost_input$LD) + expect_equal(length(converted$colocboost_input$X_ref), 1) + expect_equal(nrow(converted$colocboost_input$X_ref[[1]]), 100L) + expect_equal(ncol(converted$colocboost_input$X_ref[[1]]), length(getVariantIds(ld_data))) + + local_mocked_bindings( + .cb_call_colocboost = function(args, dots) list(args = args, dots = dots) + ) + X_ref <- getGenotypes(ld_data)[, match(variant_id, getVariantIds(ld_data)), drop = FALSE] + colnames(X_ref) <- variant_id + result <- suppressMessages(colocboost_analysis( + sumstat = data.frame(variant = variant_id, z = seq_along(variant_id), n = 1000), + X_ref = X_ref, + LD_reference_info = ld_data, + qc_method = "none", + M = 2 + )) + expect_null(result$args$LD) + expect_equal(length(result$args$X_ref), 1) + expect_equal(result$args$M, 2) +}) + test_that("RegionalData individual adapter restores context names from residual_Y", { ind_region <- make_individual_region_data(n = 12, p = 5, n_contexts = 2, n_events = 1) names(ind_region$individual_data$residual_X) <- NULL diff --git a/tests/testthat/test_sumstats_qc.R b/tests/testthat/test_sumstats_qc.R index 68322daa..5ae5be63 100644 --- a/tests/testthat/test_sumstats_qc.R +++ b/tests/testthat/test_sumstats_qc.R @@ -425,6 +425,63 @@ test_that("summary_stats_qc basic genotype-backed path does not compute LD", { expect_equal(ncol(result$LD_matrix), nrow(result$rss_input$sumstats)) }) +test_that("summary_stats_qc accepts genotype-backed LDData", { + skip_if_not_installed("pgenlibr") + td <- test_path("test_data") + tmp <- tempfile("lddata_qc_") + dir.create(tmp) + on.exit(unlink(tmp, recursive = TRUE), add = TRUE) + + prefix <- "test_variants" + for (ext in c("pgen", "pvar", "psam", "afreq")) { + file.copy(file.path(td, paste0(prefix, ".", ext)), + file.path(tmp, paste0(prefix, ".", ext))) + } + meta_file <- file.path(tmp, "ld_meta.tsv") + writeLines(c("chrom\tstart\tend\tpath", "21\t0\t0\ttest_variants"), meta_file) + + ld_data <- suppressWarnings(suppressMessages(load_LD_matrix( + meta_file, + region = "chr21:17513228-17550000", + return_genotype = TRUE + ))) + variant_info <- getVariantInfo(ld_data) + ref_panel <- as.data.frame(S4Vectors::mcols(variant_info)) + ref_panel$chrom <- as.character(GenomicRanges::seqnames(variant_info)) + ref_panel$pos <- GenomicRanges::start(variant_info) + is_snp <- nchar(ref_panel$A1) == 1 & nchar(ref_panel$A2) == 1 + allele_pair <- apply(cbind(ref_panel$A1, ref_panel$A2), 1, function(x) { + paste(sort(x), collapse = "") + }) + ref_panel <- ref_panel[is_snp & !allele_pair %in% c("AT", "CG"), , drop = FALSE] + ref_panel <- utils::head(ref_panel, 5) + + sumstats <- data.frame( + chrom = ref_panel$chrom, + pos = ref_panel$pos, + A1 = ref_panel$A1, + A2 = ref_panel$A2, + beta = seq_len(nrow(ref_panel)) / 10, + se = rep(0.1, nrow(ref_panel)), + z = seq_len(nrow(ref_panel)), + stringsAsFactors = FALSE + ) + rss_input <- list(sumstats = sumstats, n = 1000, var_y = 1) + + local_mocked_bindings( + compute_LD = function(...) stop("compute_LD should not be called") + ) + result <- suppressMessages(summary_stats_qc( + rss_input = rss_input, + LD_data = ld_data, + qc_method = "none", + impute = FALSE + )) + + expect_equal(nrow(result$LD_matrix), 100L) + expect_equal(ncol(result$LD_matrix), nrow(result$rss_input$sumstats)) +}) + test_that("summary_stats_qc PIP screening uses LD-independent SER", { td <- make_test_sumstats_ld(n_variants = 5) X_ref <- matrix(rnorm(50), 10, 5)