Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
1bf595d
Fix marginal-HR g-formula: take first-event row in survival reduction
remlapmot Jun 9, 2026
be14ad7
Bump version
remlapmot Jun 9, 2026
7e12f39
Amend TP email
remlapmot Jun 9, 2026
af444b2
Auto-format code
github-actions[bot] Jun 9, 2026
374f78e
Make glum models picklable so parallel/offload work end-to-end
remlapmot Jun 9, 2026
107f0a0
Auto-format code
github-actions[bot] Jun 9, 2026
2f5d594
Parallelize the bootstrap hazard simulation over a process pool
remlapmot Jun 9, 2026
b9a3dc6
Auto-format code
github-actions[bot] Jun 9, 2026
8e88545
Preserve main-fit weight models across the bootstrap loop
remlapmot Jun 10, 2026
e8eab64
Auto-format code
github-actions[bot] Jun 10, 2026
1f3a627
Report the censored/uncensored split in verbose expansion output
remlapmot Jun 10, 2026
dad6e28
Auto-format code
github-actions[bot] Jun 10, 2026
0cc419b
Fix bootstrap weight corruption under weight_preexpansion
remlapmot Jun 11, 2026
71d4802
Fix operator precedence in selection_random filter
remlapmot Jun 11, 2026
5fca21a
Normalize time_varying_cols and fixed_cols
remlapmot Jun 11, 2026
592da92
Replace Python 2 dict.has_key with dict.get in SEQoutput.retrieve_data
remlapmot Jun 11, 2026
c0d568c
Fix UnboundLocalError when collect() is called before fit()
remlapmot Jun 11, 2026
7e1e51e
Skip padded None entries in excused_colnames validation
remlapmot Jun 11, 2026
98ab567
Make _JaxFit picklable
remlapmot Jun 11, 2026
2dbf097
Add jax to dev dependencies
remlapmot Jun 12, 2026
228ed8a
Reject hazard ratio estimation for method dose-response
remlapmot Jun 12, 2026
0899c16
Freeze categorical levels into the glum pickle reference frame dtypes
remlapmot Jun 12, 2026
8f60240
Skip refitting weight models on bootstrap replicates under weight_pre…
remlapmot Jun 12, 2026
1d90b06
Offload all weight models under their real attribute names
remlapmot Jun 12, 2026
1b61997
Ship the SEQuential object and analysis frame to parallel bootstrap w…
remlapmot Jun 12, 2026
234fd44
Share the polars-to-pandas conversion across numerator/denominator an…
remlapmot Jun 12, 2026
1434d16
Route _build_md model sections through SEQoutput.summary
remlapmot Jun 12, 2026
1868b7c
Remove Linux tests
remlapmot Jun 15, 2026
752af61
Clarify unique vs non-unique in diagnostic table docs and report labels
remlapmot Jun 16, 2026
7dfaf82
Remove comment from random selection filter
remlapmot Jun 16, 2026
b8dff51
Relax cached-weight bootstrap test to numerical precision
remlapmot Jun 16, 2026
69cabcc
Fix flake8 warnings in SEQopts and optional-dependency test imports
remlapmot Jun 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [macos-26, ubuntu-latest]
os: [macos-26]
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

steps:
Expand Down
3 changes: 2 additions & 1 deletion pySEQTarget/SEQopts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ class SEQopts:
:param excused_colnames: Column names (at the same length of treatment_level) specifying excused conditions, default ``[]``
:param expand_only: If True, ``SEQuential.expand()`` returns the expanded dataset and skips weighting,
modelling, and survival steps
:param glm_package: Backend for fitting logistic (outcome/competing-event) models ["statsmodels", "glum", or "jax"], default "statsmodels".
:param glm_package: Backend for fitting logistic (outcome/competing-event)
models ["statsmodels", "glum", or "jax"], default "statsmodels".
:param followup_class: Boolean to force followup values to be treated as classes
:param followup_include: Boolean to force regular followup values into model covariates
:param followup_spline: Boolean to force followup values to be fit to cubic spline
Expand Down
61 changes: 46 additions & 15 deletions pySEQTarget/SEQoutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import polars as pl
from statsmodels.base.wrapper import ResultsWrapper

from .helpers import _build_md, _build_pdf
from .helpers import Offloader, _build_md, _build_pdf
from .SEQopts import SEQopts


Expand Down Expand Up @@ -41,7 +41,13 @@ class SEQoutput:
:type risk_difference: pl.DataFrame or None
:param time: Timings for every step of the process completed thus far
:type time: dict or None
:param diagnostic_tables: Diagnostic tables for unique and nonunique outcome events and treatment switches
:param diagnostic_tables: Diagnostic tables (outcome, follow-up, switch, and
competing-event counts where applicable), each split by baseline treatment
arm. The "unique" tables count distinct subjects; the "nonunique" tables
count rows: total outcome events for the outcome tables, and total
person-time intervals (expanded follow-up rows) for the follow-up tables.
For a one-time (terminal) outcome the unique and nonunique outcome counts
coincide, since each subject contributes at most one event row.
:type diagnostic_tables: dict or None
"""

Expand Down Expand Up @@ -72,7 +78,10 @@ def plot(self) -> None:
plt.show()

def summary(
self, type=Optional[Literal["numerator", "denominator", "outcome", "compevent"]]
self,
type: Optional[
Literal["numerator", "denominator", "outcome", "compevent"]
] = None,
) -> List:
"""
Returns a list of model summaries of either the numerator, denominator, outcome, or competing event models
Expand All @@ -90,11 +99,26 @@ def summary(
case _:
models = self.outcome_models

return [model.summary() for model in models if model is not None]
if models is None:
return []

# Under offload=True the stored entries are path refs; load them back.
loader = None
if self.options is not None and self.options.offload:
loader = Offloader(enabled=True, dir=self.options.offload_dir)

summaries = []
for model in models:
if model is None:
continue
if loader is not None:
model = loader.load_model(model)
summaries.append(model.summary())
return summaries

def retrieve_data(
self,
type=Optional[
type: Optional[
Literal[
"km_data",
"hazard",
Expand All @@ -109,11 +133,24 @@ def retrieve_data(
"unique_switches",
"nonunique_switches",
]
],
] = None,
) -> pl.DataFrame:
"""
Getter for data stored within ``SEQoutput``

The diagnostic tables come in "unique" and "nonunique" variants that count
different things, each broken down by baseline treatment arm:

- ``unique_outcomes`` / ``nonunique_outcomes``: distinct subjects who had
the outcome vs. the total number of outcome events. These coincide for a
one-time (terminal) outcome, since each subject contributes at most one
event row.
- ``unique_followup`` / ``nonunique_followup``: distinct subjects
contributing follow-up vs. the total number of person-time intervals
(expanded rows). The nonunique count is much larger because each subject
contributes one row per follow-up period; it is the denominator that,
with ``nonunique_outcomes``, gives the per-arm event rate.

:param type: Data which you would like to access, ['km_data', 'hazard',
'risk_ratio', 'risk_difference', 'unique_outcomes',
'nonunique_outcomes', 'unique_followup', 'nonunique_followup',
Expand Down Expand Up @@ -141,19 +178,13 @@ def retrieve_data(
case "nonunique_compevent":
data = self.diagnostic_tables.get("nonunique_compevent")
case "unique_switches":
if self.diagnostic_tables.has_key("unique_switches"):
data = self.diagnostic_tables["unique_switches"]
else:
data = None
data = self.diagnostic_tables.get("unique_switches")
case "nonunique_switches":
if self.diagnostic_tables.has_key("nonunique_switches"):
data = self.diagnostic_tables["nonunique_switches"]
else:
data = None
data = self.diagnostic_tables.get("nonunique_switches")
case _:
data = self.km_data
if data is None:
raise ValueError("Data {type} was not created in the SEQuential process")
raise ValueError(f"Data {type} was not created in the SEQuential process")
return data

def to_md(self, filename="SEQuential_results.md") -> None:
Expand Down
120 changes: 87 additions & 33 deletions pySEQTarget/SEQuential.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,11 @@ def __init__(
self.eligible_col = eligible_col
self.treatment_col = treatment_col
self.outcome_col = outcome_col
self.time_varying_cols = time_varying_cols
self.fixed_cols = fixed_cols
# Normalize the documented-Optional covariate lists to [] once, so the
# many downstream `for col in self.fixed_cols` / set() sites need no
# None guards.
self.time_varying_cols = time_varying_cols if time_varying_cols else []
self.fixed_cols = fixed_cols if fixed_cols else []
self.method = method

self._time_initialized = datetime.datetime.now()
Expand Down Expand Up @@ -111,6 +114,16 @@ def __init__(
_param_checker(self)
_data_checker(self)

def __getstate__(self):
# The glum design-info cache (_outcome_fit) holds patsy DesignInfo
# objects, which can't be pickled (patsy #26). It is a per-process speed
# cache rebuilt lazily on first fit, so drop it when crossing a process
# boundary (parallel bootstrap / offload); workers repopulate it. Without
# this, parallel=True + glm_package="glum" crashes on pickling.
state = self.__dict__.copy()
state.pop("_patsy_design_cache", None)
return state

def expand(self):
"""
Creates the sequentially nested, emulated target trial structure.
Expand Down Expand Up @@ -187,6 +200,17 @@ def expand(self):
if self.verbose:
n, m = self.DT.shape
print(f"Final analysis dataset: {n:,} observations, {m} variables")
# Under censoring the outcome model is fit only on the un-censored
# rows (switch != 1, matching _outcome_fit); the rest are retained in
# the dataset but artificially censored. Report the split so the
# count lines up with implementations that print only the modelled
# rows (e.g. Stata seqtte).
if self.method == "censoring" and "switch" in self.DT.columns:
n_censored = self.DT.filter(pl.col("switch") == 1).height
print(
f" entering outcome model (uncensored): {n - n_censored:,}\n"
f" artificially censored (treatment switch): {n_censored:,}"
)

end = time.perf_counter()
self._expansion_time = _format_time(start, end)
Expand Down Expand Up @@ -250,32 +274,50 @@ def fit(self) -> None:
boot_idx = self._current_boot_idx

if self.weighted:
WDT_pl = _weight_setup(self)
if not self.weight_preexpansion and not self.excused:
WDT_pl = WDT_pl.filter(pl.col("followup") > 0)

# The weight-fit helpers (_fit_LTFU etc.) use pandas-style indexing
# and pass pandas frames to glum/statsmodels, so we convert once.
# The fits don't mutate WDT_pd - they store models on `self` - so
# we keep the original polars frame for the downstream steps
# rather than paying a pl.from_pandas() round-trip per replicate.
WDT_pd = WDT_pl.to_pandas()
for col in self.fixed_cols:
if col in WDT_pd.columns:
WDT_pd[col] = WDT_pd[col].astype("category")

_fit_LTFU(self, WDT_pd)
_fit_visit(self, WDT_pd)
_fit_numerator(self, WDT_pd)
_fit_denominator(self, WDT_pd)

if self.offload:
_offload_weights(self, boot_idx)

del WDT_pd
WDT = _weight_predict(self, WDT_pl)
_weight_bind(self, WDT)
self.weight_stats = _weight_stats(self)
# With weight_preexpansion the weight models are fit on the
# un-resampled pre-expansion data, so every bootstrap replicate
# would refit bit-identical models and re-predict identical
# weights. Cache the predicted weight frame from the main fit and
# reuse it on replicates; only the join onto the resampled DT
# (_weight_bind) and the resulting weight stats differ.
cached_WDT = (
getattr(self, "_main_weight_WDT", None)
if boot_idx is not None and self.weight_preexpansion
else None
)
if cached_WDT is not None:
_weight_bind(self, cached_WDT)
self.weight_stats = _weight_stats(self)
else:
WDT_pl = _weight_setup(self)
if not self.weight_preexpansion and not self.excused:
WDT_pl = WDT_pl.filter(pl.col("followup") > 0)

# The weight-fit helpers (_fit_LTFU etc.) use pandas-style
# indexing and pass pandas frames to glum/statsmodels, so we
# convert once. The fits don't mutate WDT_pd - they store
# models on `self` - so we keep the original polars frame for
# the downstream steps rather than paying a pl.from_pandas()
# round-trip per replicate.
WDT_pd = WDT_pl.to_pandas()
for col in self.fixed_cols:
if col in WDT_pd.columns:
WDT_pd[col] = WDT_pd[col].astype("category")

_fit_LTFU(self, WDT_pd)
_fit_visit(self, WDT_pd)
_fit_numerator(self, WDT_pd)
_fit_denominator(self, WDT_pd)

if self.offload:
_offload_weights(self, boot_idx)

del WDT_pd
WDT = _weight_predict(self, WDT_pl)
if self.weight_preexpansion and boot_idx is None:
self._main_weight_WDT = WDT
_weight_bind(self, WDT)
self.weight_stats = _weight_stats(self)

is_boot = boot_idx is not None
start = getattr(self, "_outcome_start_params", None) if is_boot else None
Expand Down Expand Up @@ -365,6 +407,14 @@ def hazard(self) -> None:
"""
start = time.perf_counter()

if self.method == "dose-response":
raise NotImplementedError(
"Hazard ratio estimation is not supported for method='dose-response': "
"the counterfactual simulation only sets the baseline treatment, but "
"the dose-response outcome model depends on the cumulative dose, so "
"both arms would simulate identical outcomes (HR ≈ 1)."
)

if not hasattr(self, "outcome_model") or not self.outcome_model:
raise ValueError(
"Outcome model not found. Please run the 'fit' method before calculating hazard ratio."
Expand Down Expand Up @@ -429,13 +479,17 @@ def collect(self) -> SEQoutput:
"collection_time": self._time_collected,
}

if self.compevent_colname is not None:
compevent_models = [model["compevent"] for model in self.outcome_model]
else:
compevent_models = None

if self.outcome_model is not None:
outcome_models = [model["outcome"] for model in self.outcome_model]
if self.compevent_colname is not None:
compevent_models = [model["compevent"] for model in self.outcome_model]
else:
compevent_models = None
else:
# collect() before fit(): no models to report, but the rest of the
# output (diagnostics, timings) is still valid.
outcome_models = None
compevent_models = None

if self.risk_estimates is None:
risk_ratio = risk_difference = None
Expand Down
Loading