Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/api/miscellaneous.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ Various functionality not part of the main API, but we want to be exposed/search
options:
members:
- "QQResult"
- "qq_cdf"
- "qq_transform"
- "qq_simulate"
35 changes: 14 additions & 21 deletions pymgcv/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@
from matplotlib.figure import Figure
from pandas import CategoricalDtype
from pandas.api.types import is_numeric_dtype
from scipy.stats import norm

from pymgcv.basis_functions import FactorSmooth, RandomEffect
from pymgcv.gam import AbstractGAM
from pymgcv.qq import QQResult, qq_simulate
from pymgcv.rlibs import rstats
from pymgcv.rpy_utils import to_py
from pymgcv.terms import (
AbstractTerm,
L,
Expand Down Expand Up @@ -712,7 +711,11 @@ def random_effect(
data=pd.DataFrame(pd.Series(levels, name=term.varnames[0], dtype="category")),
)
order = np.argsort(pred)
x = rstats.qnorm(rstats.ppoints(len(levels))) # Gaussian quantiles

n = len(levels)
probs = (np.arange(n) + 0.5) / n
x = norm.ppf(probs)

ax.scatter(
x,
pred[order],
Expand All @@ -724,14 +727,10 @@ def random_effect(
# page 6. We need to multiply them by sd(.dat$y) because we are not normalizing the
# random effects.
alpha = (1 - confidence_interval_level) / 2
n = len(levels)
p = to_py(rstats.ppoints(n))
interval = (
np.std(pred)
* to_py(rstats.qnorm(alpha)) # e.g. 1.96
* np.sqrt(p * (1 - p) / n)
/ to_py(rstats.dnorm(x))
np.std(pred) * norm.ppf(alpha) * np.sqrt(probs * (1 - probs) / n) / norm.pdf(x)
)

ref_y = x * np.std(pred)
_with_disable(ax.fill_between)(
x,
Expand Down Expand Up @@ -768,15 +767,9 @@ def qq(
object storing the theoretical residuals, residuals, and the confidence
interval. Defaults to [`qq_simulate`][pymgcv.qq.qq_simulate], which is the
most widely supported method only requiring the family to provide a
sampling function. [`qq_cdf`][pymgcv.qq.qq_cdf] can be used for families
providing a cdf method, which transforms the data to a uniform
distribution

!!! warning

`qq_cdf` transforms all data to [0, 1] uniform. This may make it hard to
spot deviations in tail behaviour, e.g. due to a few extreme outliers.

sampling function. [`qq_transform`][pymgcv.qq.qq_transform] can be used for
families providing a cdf method, which transforms the data to a known
distribution for which an analytical confidence interval is available.
scatter_kwargs: Key word arguments passed to `matplotlib.pyplot.scatter`.
fill_between_kwargs: Key word arguments passed to
`matplotlib.pyplot.fill_between`, for plotting the confidence interval.
Expand All @@ -787,14 +780,14 @@ def qq(

!!! note

We can change e.g. the confidence level and number of simulations using
partial application:
To change settings of ``qq_fun``, use partial application, e.g.
```python
from pymgcv.qq import qq_simulate
from functools import partial
import pymgcv.plot as gplt

qq_fun = partial(qq_simulate, level=0.95, n_sim=10)
# qq(..., qq_fun=qq_fun)
# gplt.qq(..., qq_fun=qq_fun)
```

Returns:
Expand Down
76 changes: 49 additions & 27 deletions pymgcv/qq.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
from typing import Literal

import numpy as np
from scipy.stats import beta, norm

from pymgcv.families import SupportsCDF
from pymgcv.gam import AbstractGAM
from pymgcv.rlibs import rbase, rstats
from pymgcv.rpy_utils import is_null, to_py, to_rpy
from pymgcv.rpy_utils import is_null, to_py


@dataclass
Expand Down Expand Up @@ -84,32 +85,27 @@ def qq_simulate(


# TODO support normal. For this logp in cdf matters!
def qq_cdf(
def qq_transform(
gam: AbstractGAM,
*,
transform_to: Literal["normal", "uniform"] = "normal",
level: float = 0.9,
) -> QQResult:
"""Generate a QQ-plot by transforming the data to uniform using the families CDF.
"""Generate a QQ-plot by transforming the data to a known distribution.

In this case:

- The theoretical values are evenly spaced between 0 and 1,
- The residuals are the result of passing the data through the family's CDF,
parameterized using the fitted values (and weights and scale as appropriate).
This plots the theoretical quantiles against the transformed data. The data
are transformed by 1) passing through the CDF implied by the model, and 2)
passing to the quantile function of the distribution implied by ``transform_to``.

!!! note

The Q-Q plot generated by this function likely will not be good at for detecting deviations
in tail behaviour/due to outliers. For example, a few outliers would be transformed to
values close to zero or one, which will often **not** be visable in the plot!


The Q-Q curve when using [`qq_cdf`][pymgcv.qq.qq_cdf] will naturally be a
different shape to those produced by [`qq_simulate`][pymgcv.qq.qq_simulate]!
Using ``transform_to="uniform"`` may hide outliers/tail behaviour issues,
as the data is constrained to [0, 1], which can be hard to assess visually.

Args:
gam: The fitted GAM object. The family should support the CDF method,
which can be checked with `isinstance(family, SupportsCDF)`.
transform_to: The distribution to transform the residuals to.
level: The confidence level for the interval.

Returns:
Expand All @@ -119,10 +115,35 @@ def qq_cdf(
raise ValueError("GAM has not been fit")

if not isinstance(gam.family, SupportsCDF):
raise TypeError("Family must support CDF method to use qq_cdf.")

raise TypeError("Family must support CDF method to use qq_transform.")

class _Uniform:
@staticmethod
def quantile(probs):
return probs

@staticmethod
def confidence_interval(alpha, n_obs: int):
a = np.arange(n_obs)
lower = beta.ppf(alpha, a, n_obs - a + 1)
upper = beta.ppf(1 - alpha, a, n_obs - a + 1)
return lower, upper

class _Normal:
@staticmethod
def quantile(probs):
q = norm.ppf(probs)
return np.nan_to_num(q, nan=np.nan, posinf=100, neginf=-100)

@staticmethod
def confidence_interval(alpha, n_obs: int):
a = (np.arange(n_obs) + 0.5) / n_obs
q = norm.ppf(a)
offset = norm.ppf(alpha) * np.sqrt(a * (1 - a) / n_obs) / norm.pdf(q)
return q - offset, q + offset

dist = {"uniform": _Uniform, "normal": _Normal}[transform_to]
model = gam.fit_state.rgam

fit = rstats.fitted(model)
weights = rstats.weights(model, type="prior")
sigma2 = model.rx2["sig2"]
Expand All @@ -133,22 +154,23 @@ def qq_cdf(
fit, weights, sigma2 = to_py(fit), to_py(weights), to_py(sigma2)

# Transform the data to uniform (CDF)
residuals = gam.family.cdf(
probs = gam.family.cdf(
to_py(gam.fit_state.rgam.rx2["y"]),
mu=fit,
wt=weights,
scale=sigma2,
)
n_obs = len(residuals)
theoretical = (np.arange(n_obs) + 0.5) / n_obs
residuals = np.sort(residuals)
tmp = np.arange(n_obs)
alpha = (1 - level) / 2
lower = rstats.qbeta(alpha, to_rpy(tmp), to_rpy(n_obs - tmp + 1))
upper = rstats.qbeta(1 - alpha, to_rpy(tmp), to_rpy(n_obs - tmp + 1))
probs = np.clip(probs, 0, 1) # Probably not needed but avoid nan risk
probs = np.sort(probs)
transformed = dist.quantile(probs)

n_obs = len(probs)
theoretical = dist.quantile((np.arange(n_obs) + 0.5) / n_obs)

alpha = (1 - level) / 2
lower, upper = dist.confidence_interval(alpha, n_obs)
return QQResult(
theoretical=theoretical,
residuals=residuals,
residuals=transformed,
interval=(lower, upper),
)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies = [
"numpy>=2.2.5",
"pandas>=2.2.3",
"matplotlib>=3.10",
"scipy>=1.12.0",
]

[build-system]
Expand Down
10 changes: 8 additions & 2 deletions tests/test_qq.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
from functools import partial

import numpy as np
import pandas as pd
import pytest

from pymgcv.gam import GAM
from pymgcv.qq import qq_cdf, qq_simulate
from pymgcv.qq import qq_simulate, qq_transform
from pymgcv.terms import L, S

from .gam_test_cases import GAMTestCase, get_test_cases


@pytest.mark.parametrize(
"qq_fun",
[qq_cdf, qq_simulate],
[
partial(qq_transform, transform_to="uniform"),
partial(qq_transform, transform_to="normal"),
qq_simulate,
],
)
def test_qq_functions(qq_fun):
"""Test that qq_uniform runs without error and returns expected structure."""
Expand Down