From 087eec9839749b186f9ae92622ce1bca8741553e Mon Sep 17 00:00:00 2001
From: danielward27 <danielward27@outlook.com>
Date: Thu, 4 Sep 2025 14:42:47 +0100
Subject: [PATCH] add qq_transform (tnormal/tunif)

---
 docs/api/miscellaneous.md |  2 +-
 pymgcv/plot.py            | 35 ++++++++----------
 pymgcv/qq.py              | 76 +++++++++++++++++++++++++--------------
 pyproject.toml            |  1 +
 tests/test_qq.py          | 10 ++++--
 5 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/docs/api/miscellaneous.md b/docs/api/miscellaneous.md
index fda61ab..79ee061 100644
--- a/docs/api/miscellaneous.md
+++ b/docs/api/miscellaneous.md
@@ -6,5 +6,5 @@ Various functionality not part of the main API, but we want to be exposed/search
     options:
       members:
         - "QQResult"
-        - "qq_cdf"
+        - "qq_transform"
         - "qq_simulate"
diff --git a/pymgcv/plot.py b/pymgcv/plot.py
index 80c9f55..721e146 100644
--- a/pymgcv/plot.py
+++ b/pymgcv/plot.py
@@ -15,12 +15,11 @@
 from matplotlib.figure import Figure
 from pandas import CategoricalDtype
 from pandas.api.types import is_numeric_dtype
+from scipy.stats import norm
 
 from pymgcv.basis_functions import FactorSmooth, RandomEffect
 from pymgcv.gam import AbstractGAM
 from pymgcv.qq import QQResult, qq_simulate
-from pymgcv.rlibs import rstats
-from pymgcv.rpy_utils import to_py
 from pymgcv.terms import (
     AbstractTerm,
     L,
@@ -712,7 +711,11 @@ def random_effect(
         data=pd.DataFrame(pd.Series(levels, name=term.varnames[0], dtype="category")),
     )
     order = np.argsort(pred)
-    x = rstats.qnorm(rstats.ppoints(len(levels)))  # Gaussian quantiles
+
+    n = len(levels)
+    probs = (np.arange(n) + 0.5) / n
+    x = norm.ppf(probs)
+
     ax.scatter(
         x,
         pred[order],
@@ -724,14 +727,10 @@ def random_effect(
     # page 6. We need to multiply them by sd(.dat$y) because we are not normalizing the
     # random effects.
     alpha = (1 - confidence_interval_level) / 2
-    n = len(levels)
-    p = to_py(rstats.ppoints(n))
     interval = (
-        np.std(pred)
-        * to_py(rstats.qnorm(alpha))  # e.g. 1.96
-        * np.sqrt(p * (1 - p) / n)
-        / to_py(rstats.dnorm(x))
+        np.std(pred) * norm.ppf(alpha) * np.sqrt(probs * (1 - probs) / n) / norm.pdf(x)
     )
+
     ref_y = x * np.std(pred)
     _with_disable(ax.fill_between)(
         x,
@@ -768,15 +767,9 @@ def qq(
             object storing the theoretical residuals, residuals, and the confidence
             interval. Defaults to [`qq_simulate`][pymgcv.qq.qq_simulate], which is the
             most widely supported method only requiring the family to provide a
-            sampling function. [`qq_cdf`][pymgcv.qq.qq_cdf] can be used for families
-            providing a cdf method, which transforms the data to a uniform
-            distribution
-
-            !!! warning
-
-                `qq_cdf` transforms all data to [0, 1] uniform. This may make it hard to
-                spot deviations in tail behaviour, e.g. due to a few extreme outliers.
-
+            sampling function. [`qq_transform`][pymgcv.qq.qq_transform] can be used for
+            families providing a cdf method, which transforms the data to a known
+            distribution for which an analytical confidence interval is available.
         scatter_kwargs: Key word arguments passed to `matplotlib.pyplot.scatter`.
         fill_between_kwargs: Key word arguments passed to
             `matplotlib.pyplot.fill_between`, for plotting the confidence interval.
@@ -787,14 +780,14 @@ def qq(
 
     !!! note
 
-        We can change e.g. the confidence level and number of simulations using
-        partial application:
+        To change settings of ``qq_fun``, use partial application, e.g.
         ```python
         from pymgcv.qq import qq_simulate
         from functools import partial
+        import pymgcv.plot as gplt
 
         qq_fun = partial(qq_simulate, level=0.95, n_sim=10)
-        # qq(..., qq_fun=qq_fun)
+        # gplt.qq(..., qq_fun=qq_fun)
         ```
 
     Returns:
diff --git a/pymgcv/qq.py b/pymgcv/qq.py
index 906c922..1151267 100644
--- a/pymgcv/qq.py
+++ b/pymgcv/qq.py
@@ -2,11 +2,12 @@
 from typing import Literal
 
 import numpy as np
+from scipy.stats import beta, norm
 
 from pymgcv.families import SupportsCDF
 from pymgcv.gam import AbstractGAM
 from pymgcv.rlibs import rbase, rstats
-from pymgcv.rpy_utils import is_null, to_py, to_rpy
+from pymgcv.rpy_utils import is_null, to_py
 
 
 @dataclass
@@ -84,32 +85,27 @@ def qq_simulate(
 
 
 # TODO support normal. For this logp in cdf matters!
-def qq_cdf(
+def qq_transform(
     gam: AbstractGAM,
     *,
+    transform_to: Literal["normal", "uniform"] = "normal",
     level: float = 0.9,
 ) -> QQResult:
-    """Generate a QQ-plot by transforming the data to uniform using the families CDF.
+    """Generate a QQ-plot by transforming the data to a known distribution.
 
-    In this case:
-
-       - The theoretical values are evenly spaced between 0 and 1,
-       - The residuals are the result of passing the data through the family's CDF,
-         parameterized using the fitted values (and weights and scale as appropriate).
+    This plots the theoretical quantiles against the transformed data. The data
+    are transformed by 1) passing through the CDF implied by the model, and 2)
+    passing to the quantile function of the distribution implied by ``transform_to``.
 
     !!! note
 
-        The Q-Q plot generated by this function likely will not be good at for detecting deviations
-        in tail behaviour/due to outliers. For example, a few outliers would be transformed to
-        values close to zero or one, which will often **not** be visable in the plot!
-
-
-        The Q-Q curve when using [`qq_cdf`][pymgcv.qq.qq_cdf] will naturally be a
-        different shape to those produced by [`qq_simulate`][pymgcv.qq.qq_simulate]!
+       Using ``transform_to="uniform"`` may hide outliers/tail behaviour issues,
+       as the data is constrained to [0, 1], which can be hard to assess visually.
 
     Args:
         gam: The fitted GAM object. The family should support the CDF method,
             which can be checked with `isinstance(family, SupportsCDF)`.
+        transform_to: The distribution to transform the residuals to.
         level: The confidence level for the interval.
 
     Returns:
@@ -119,10 +115,35 @@ def qq_cdf(
         raise ValueError("GAM has not been fit")
 
     if not isinstance(gam.family, SupportsCDF):
-        raise TypeError("Family must support CDF method to use qq_cdf.")
-
+        raise TypeError("Family must support CDF method to use qq_transform.")
+
+    class _Uniform:
+        @staticmethod
+        def quantile(probs):
+            return probs
+
+        @staticmethod
+        def confidence_interval(alpha, n_obs: int):
+            a = np.arange(n_obs)
+            lower = beta.ppf(alpha, a, n_obs - a + 1)
+            upper = beta.ppf(1 - alpha, a, n_obs - a + 1)
+            return lower, upper
+
+    class _Normal:
+        @staticmethod
+        def quantile(probs):
+            q = norm.ppf(probs)
+            return np.nan_to_num(q, nan=np.nan, posinf=100, neginf=-100)
+
+        @staticmethod
+        def confidence_interval(alpha, n_obs: int):
+            a = (np.arange(n_obs) + 0.5) / n_obs
+            q = norm.ppf(a)
+            offset = norm.ppf(alpha) * np.sqrt(a * (1 - a) / n_obs) / norm.pdf(q)
+            return q - offset, q + offset
+
+    dist = {"uniform": _Uniform, "normal": _Normal}[transform_to]
     model = gam.fit_state.rgam
-
     fit = rstats.fitted(model)
     weights = rstats.weights(model, type="prior")
     sigma2 = model.rx2["sig2"]
@@ -133,22 +154,23 @@ def qq_cdf(
     fit, weights, sigma2 = to_py(fit), to_py(weights), to_py(sigma2)
 
     # Transform the data to uniform (CDF)
-    residuals = gam.family.cdf(
+    probs = gam.family.cdf(
         to_py(gam.fit_state.rgam.rx2["y"]),
         mu=fit,
         wt=weights,
         scale=sigma2,
     )
-    n_obs = len(residuals)
-    theoretical = (np.arange(n_obs) + 0.5) / n_obs
-    residuals = np.sort(residuals)
-    tmp = np.arange(n_obs)
-    alpha = (1 - level) / 2
-    lower = rstats.qbeta(alpha, to_rpy(tmp), to_rpy(n_obs - tmp + 1))
-    upper = rstats.qbeta(1 - alpha, to_rpy(tmp), to_rpy(n_obs - tmp + 1))
+    probs = np.clip(probs, 0, 1)  # Probably not needed but avoid nan risk
+    probs = np.sort(probs)
+    transformed = dist.quantile(probs)
+
+    n_obs = len(probs)
+    theoretical = dist.quantile((np.arange(n_obs) + 0.5) / n_obs)
 
+    alpha = (1 - level) / 2
+    lower, upper = dist.confidence_interval(alpha, n_obs)
     return QQResult(
         theoretical=theoretical,
-        residuals=residuals,
+        residuals=transformed,
         interval=(lower, upper),
     )
diff --git a/pyproject.toml b/pyproject.toml
index 4a3570d..7ec97f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "numpy>=2.2.5",
     "pandas>=2.2.3",
     "matplotlib>=3.10",
+    "scipy>=1.12.0",
 ]
 
 [build-system]
diff --git a/tests/test_qq.py b/tests/test_qq.py
index 00bbe74..6bf30b4 100644
--- a/tests/test_qq.py
+++ b/tests/test_qq.py
@@ -1,9 +1,11 @@
+from functools import partial
+
 import numpy as np
 import pandas as pd
 import pytest
 
 from pymgcv.gam import GAM
-from pymgcv.qq import qq_cdf, qq_simulate
+from pymgcv.qq import qq_simulate, qq_transform
 from pymgcv.terms import L, S
 
 from .gam_test_cases import GAMTestCase, get_test_cases
@@ -11,7 +13,11 @@
 
 @pytest.mark.parametrize(
     "qq_fun",
-    [qq_cdf, qq_simulate],
+    [
+        partial(qq_transform, transform_to="uniform"),
+        partial(qq_transform, transform_to="normal"),
+        qq_simulate,
+    ],
 )
 def test_qq_functions(qq_fun):
     """Test that qq_uniform runs without error and returns expected structure."""