From 087eec9839749b186f9ae92622ce1bca8741553e Mon Sep 17 00:00:00 2001 From: danielward27 Date: Thu, 4 Sep 2025 14:42:47 +0100 Subject: [PATCH] add qq_transform (tnormal/tunif) --- docs/api/miscellaneous.md | 2 +- pymgcv/plot.py | 35 ++++++++---------- pymgcv/qq.py | 76 +++++++++++++++++++++++++-------------- pyproject.toml | 1 + tests/test_qq.py | 10 ++++-- 5 files changed, 73 insertions(+), 51 deletions(-) diff --git a/docs/api/miscellaneous.md b/docs/api/miscellaneous.md index fda61ab..79ee061 100644 --- a/docs/api/miscellaneous.md +++ b/docs/api/miscellaneous.md @@ -6,5 +6,5 @@ Various functionality not part of the main API, but we want to be exposed/search options: members: - "QQResult" - - "qq_cdf" + - "qq_transform" - "qq_simulate" diff --git a/pymgcv/plot.py b/pymgcv/plot.py index 80c9f55..721e146 100644 --- a/pymgcv/plot.py +++ b/pymgcv/plot.py @@ -15,12 +15,11 @@ from matplotlib.figure import Figure from pandas import CategoricalDtype from pandas.api.types import is_numeric_dtype +from scipy.stats import norm from pymgcv.basis_functions import FactorSmooth, RandomEffect from pymgcv.gam import AbstractGAM from pymgcv.qq import QQResult, qq_simulate -from pymgcv.rlibs import rstats -from pymgcv.rpy_utils import to_py from pymgcv.terms import ( AbstractTerm, L, @@ -712,7 +711,11 @@ def random_effect( data=pd.DataFrame(pd.Series(levels, name=term.varnames[0], dtype="category")), ) order = np.argsort(pred) - x = rstats.qnorm(rstats.ppoints(len(levels))) # Gaussian quantiles + + n = len(levels) + probs = (np.arange(n) + 0.5) / n + x = norm.ppf(probs) + ax.scatter( x, pred[order], @@ -724,14 +727,10 @@ def random_effect( # page 6. We need to multiply them by sd(.dat$y) because we are not normalizing the # random effects. alpha = (1 - confidence_interval_level) / 2 - n = len(levels) - p = to_py(rstats.ppoints(n)) interval = ( - np.std(pred) - * to_py(rstats.qnorm(alpha)) # e.g. 1.96 - * np.sqrt(p * (1 - p) / n) - / to_py(rstats.dnorm(x)) + np.std(pred) * norm.ppf(alpha) * np.sqrt(probs * (1 - probs) / n) / norm.pdf(x) ) + ref_y = x * np.std(pred) _with_disable(ax.fill_between)( x, @@ -768,15 +767,9 @@ def qq( object storing the theoretical residuals, residuals, and the confidence interval. Defaults to [`qq_simulate`][pymgcv.qq.qq_simulate], which is the most widely supported method only requiring the family to provide a - sampling function. [`qq_cdf`][pymgcv.qq.qq_cdf] can be used for families - providing a cdf method, which transforms the data to a uniform - distribution - - !!! warning - - `qq_cdf` transforms all data to [0, 1] uniform. This may make it hard to - spot deviations in tail behaviour, e.g. due to a few extreme outliers. - + sampling function. [`qq_transform`][pymgcv.qq.qq_transform] can be used for + families providing a cdf method, which transforms the data to a known + distribution for which an analytical confidence interval is available. scatter_kwargs: Key word arguments passed to `matplotlib.pyplot.scatter`. fill_between_kwargs: Key word arguments passed to `matplotlib.pyplot.fill_between`, for plotting the confidence interval. @@ -787,14 +780,14 @@ def qq( !!! note - We can change e.g. the confidence level and number of simulations using - partial application: + To change settings of ``qq_fun``, use partial application, e.g. ```python from pymgcv.qq import qq_simulate from functools import partial + import pymgcv.plot as gplt qq_fun = partial(qq_simulate, level=0.95, n_sim=10) - # qq(..., qq_fun=qq_fun) + # gplt.qq(..., qq_fun=qq_fun) ``` Returns: diff --git a/pymgcv/qq.py b/pymgcv/qq.py index 906c922..1151267 100644 --- a/pymgcv/qq.py +++ b/pymgcv/qq.py @@ -2,11 +2,12 @@ from typing import Literal import numpy as np +from scipy.stats import beta, norm from pymgcv.families import SupportsCDF from pymgcv.gam import AbstractGAM from pymgcv.rlibs import rbase, rstats -from pymgcv.rpy_utils import is_null, to_py, to_rpy +from pymgcv.rpy_utils import is_null, to_py @dataclass @@ -84,32 +85,27 @@ def qq_simulate( # TODO support normal. For this logp in cdf matters! -def qq_cdf( +def qq_transform( gam: AbstractGAM, *, + transform_to: Literal["normal", "uniform"] = "normal", level: float = 0.9, ) -> QQResult: - """Generate a QQ-plot by transforming the data to uniform using the families CDF. + """Generate a QQ-plot by transforming the data to a known distribution. - In this case: - - - The theoretical values are evenly spaced between 0 and 1, - - The residuals are the result of passing the data through the family's CDF, - parameterized using the fitted values (and weights and scale as appropriate). + This plots the theoretical quantiles against the transformed data. The data + are transformed by 1) passing through the CDF implied by the model, and 2) + passing to the quantile function of the distribution implied by ``transform_to``. !!! note - The Q-Q plot generated by this function likely will not be good at for detecting deviations - in tail behaviour/due to outliers. For example, a few outliers would be transformed to - values close to zero or one, which will often **not** be visable in the plot! - - - The Q-Q curve when using [`qq_cdf`][pymgcv.qq.qq_cdf] will naturally be a - different shape to those produced by [`qq_simulate`][pymgcv.qq.qq_simulate]! + Using ``transform_to="uniform"`` may hide outliers/tail behaviour issues, + as the data is constrained to [0, 1], which can be hard to assess visually. Args: gam: The fitted GAM object. The family should support the CDF method, which can be checked with `isinstance(family, SupportsCDF)`. + transform_to: The distribution to transform the residuals to. level: The confidence level for the interval. Returns: @@ -119,10 +115,35 @@ def qq_cdf( raise ValueError("GAM has not been fit") if not isinstance(gam.family, SupportsCDF): - raise TypeError("Family must support CDF method to use qq_cdf.") - + raise TypeError("Family must support CDF method to use qq_transform.") + + class _Uniform: + @staticmethod + def quantile(probs): + return probs + + @staticmethod + def confidence_interval(alpha, n_obs: int): + a = np.arange(n_obs) + lower = beta.ppf(alpha, a, n_obs - a + 1) + upper = beta.ppf(1 - alpha, a, n_obs - a + 1) + return lower, upper + + class _Normal: + @staticmethod + def quantile(probs): + q = norm.ppf(probs) + return np.nan_to_num(q, nan=np.nan, posinf=100, neginf=-100) + + @staticmethod + def confidence_interval(alpha, n_obs: int): + a = (np.arange(n_obs) + 0.5) / n_obs + q = norm.ppf(a) + offset = norm.ppf(alpha) * np.sqrt(a * (1 - a) / n_obs) / norm.pdf(q) + return q - offset, q + offset + + dist = {"uniform": _Uniform, "normal": _Normal}[transform_to] model = gam.fit_state.rgam - fit = rstats.fitted(model) weights = rstats.weights(model, type="prior") sigma2 = model.rx2["sig2"] @@ -133,22 +154,23 @@ def qq_cdf( fit, weights, sigma2 = to_py(fit), to_py(weights), to_py(sigma2) # Transform the data to uniform (CDF) - residuals = gam.family.cdf( + probs = gam.family.cdf( to_py(gam.fit_state.rgam.rx2["y"]), mu=fit, wt=weights, scale=sigma2, ) - n_obs = len(residuals) - theoretical = (np.arange(n_obs) + 0.5) / n_obs - residuals = np.sort(residuals) - tmp = np.arange(n_obs) - alpha = (1 - level) / 2 - lower = rstats.qbeta(alpha, to_rpy(tmp), to_rpy(n_obs - tmp + 1)) - upper = rstats.qbeta(1 - alpha, to_rpy(tmp), to_rpy(n_obs - tmp + 1)) + probs = np.clip(probs, 0, 1) # Probably not needed but avoid nan risk + probs = np.sort(probs) + transformed = dist.quantile(probs) + + n_obs = len(probs) + theoretical = dist.quantile((np.arange(n_obs) + 0.5) / n_obs) + alpha = (1 - level) / 2 + lower, upper = dist.confidence_interval(alpha, n_obs) return QQResult( theoretical=theoretical, - residuals=residuals, + residuals=transformed, interval=(lower, upper), ) diff --git a/pyproject.toml b/pyproject.toml index 4a3570d..7ec97f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "numpy>=2.2.5", "pandas>=2.2.3", "matplotlib>=3.10", + "scipy>=1.12.0", ] [build-system] diff --git a/tests/test_qq.py b/tests/test_qq.py index 00bbe74..6bf30b4 100644 --- a/tests/test_qq.py +++ b/tests/test_qq.py @@ -1,9 +1,11 @@ +from functools import partial + import numpy as np import pandas as pd import pytest from pymgcv.gam import GAM -from pymgcv.qq import qq_cdf, qq_simulate +from pymgcv.qq import qq_simulate, qq_transform from pymgcv.terms import L, S from .gam_test_cases import GAMTestCase, get_test_cases @@ -11,7 +13,11 @@ @pytest.mark.parametrize( "qq_fun", - [qq_cdf, qq_simulate], + [ + partial(qq_transform, transform_to="uniform"), + partial(qq_transform, transform_to="normal"), + qq_simulate, + ], ) def test_qq_functions(qq_fun): """Test that qq_uniform runs without error and returns expected structure."""