Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/access_moppy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
calculate_longitude_bounds,
calculate_time_bounds,
normalize_cf_time_units,
parse_cmip6_table_frequency,
type_mapping,
validate_and_resample_if_needed,
validate_cmip6_frequency_compatibility,
Expand Down Expand Up @@ -596,6 +597,29 @@ def rechunk_dataset(self):
else:
logger.debug("No dataset loaded, cannot rechunk")

def _target_frequency_hint(self):
"""Map the CMOR table's target frequency to a coarse label
("daily"/"monthly"/"yearly") for time-bounds construction.

Used only as a fallback when the time axis has a single point and the
frequency cannot be inferred from point spacing. Returns None when the
frequency is not determinable or is sub-daily.
"""
if not self.compound_name:
return None
try:
target = parse_cmip6_table_frequency(self.compound_name)
except Exception:
return None
days = target.total_seconds() / 86400
if 0.9 <= days <= 1.1:
return "daily"
if 28 <= days <= 31:
return "monthly"
if 360 <= days <= 366:
return "yearly"
return None

def calculate_missing_bounds_variables(self, bnds_required):
"""Calculate missing bounds variables for coordinates."""
for bnds_var in bnds_required:
Expand All @@ -622,6 +646,9 @@ def calculate_missing_bounds_variables(self, bnds_required):
self.ds,
time_coord=coord_name,
bnds_name="bnds", # Atmosphere uses "bnds"
# Fallback for a single time point (e.g. one resampled
# year) where the frequency cannot be inferred.
freq_hint=self._target_frequency_hint(),
)

elif coord_name in ["lat", "latitude", "y"]:
Expand Down
13 changes: 11 additions & 2 deletions src/access_moppy/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def __init__(
parent_info: dict[str, dict[str, Any]] | None = None,
model_id: str | None = None,
validate_frequency: bool = True,
enable_resampling: bool = False,
enable_resampling: bool = True,
enable_chunking: bool = False,
resampling_method: str = "auto",
input_folder: str | Path | None = None,
Expand Down Expand Up @@ -157,7 +157,10 @@ def __init__(
validate_frequency: Validate temporal frequency consistency across
file inputs. This is disabled automatically for xarray inputs.
enable_resampling: Enable automatic temporal resampling when
frequency mismatches are detected.
frequency mismatches are detected. Defaults to ``True``;
resampling is a no-op when the input already matches the target
frequency, and only triggers on a genuine mismatch (e.g. monthly
input for an ``Oyr`` table). Pass ``False`` to disable.
enable_chunking: Enable dask chunking in supported component
CMORisers.
resampling_method: Temporal resampling method: ``"auto"``,
Expand Down Expand Up @@ -515,6 +518,9 @@ def __init__(
vocab=self.vocab,
variable_mapping=self.variable_mapping.to_dict(),
drs_root=drs_root if drs_root else None,
validate_frequency=self.validate_frequency,
enable_resampling=self.enable_resampling,
resampling_method=self.resampling_method,
)
else:
# ACCESS-OM2 uses MOM5 (B-grid) — handled by a separate CMORiser class
Expand All @@ -528,6 +534,9 @@ def __init__(
vocab=self.vocab,
variable_mapping=self.variable_mapping.to_dict(),
drs_root=drs_root if drs_root else None,
validate_frequency=self.validate_frequency,
enable_resampling=self.enable_resampling,
resampling_method=self.resampling_method,
)
elif table in ("SImon", "SIday") or table.startswith(_mip_seaice_prefixes):
self.cmoriser = SeaIce_CMORiser(
Expand Down
12 changes: 12 additions & 0 deletions src/access_moppy/ocean.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,9 @@ def __init__(
vocab: CMIP6Vocabulary,
variable_mapping: Dict[str, Any],
drs_root: Optional[Path] = None,
validate_frequency: bool = True,
enable_resampling: bool = False,
resampling_method: str = "auto",
# Backward compatibility
input_paths: Optional[Union[str, List[str]]] = None,
):
Expand All @@ -345,6 +348,9 @@ def __init__(
vocab=vocab,
variable_mapping=variable_mapping,
drs_root=drs_root,
validate_frequency=validate_frequency,
enable_resampling=enable_resampling,
resampling_method=resampling_method,
)

nominal_resolution = vocab._get_nominal_resolution(target_realm="ocean")
Expand Down Expand Up @@ -406,6 +412,9 @@ def __init__(
vocab: CMIP6Vocabulary,
variable_mapping: Dict[str, Any],
drs_root: Optional[Path] = None,
validate_frequency: bool = True,
enable_resampling: bool = False,
resampling_method: str = "auto",
# Backward compatibility
input_paths: Optional[Union[str, List[str]]] = None,
):
Expand All @@ -417,6 +426,9 @@ def __init__(
vocab=vocab,
variable_mapping=variable_mapping,
drs_root=drs_root,
validate_frequency=validate_frequency,
enable_resampling=enable_resampling,
resampling_method=resampling_method,
)

nominal_resolution = vocab._get_nominal_resolution(target_realm="ocean")
Expand Down
118 changes: 111 additions & 7 deletions src/access_moppy/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -1917,6 +1917,53 @@ def get_resampling_frequency_string(target_freq: pd.Timedelta) -> str:
return f"{int(years)}YE"


def _normalise_calendar_name(calendar: Optional[str]) -> Optional[str]:
"""Map the non-CF ``"GREGORIAN"`` label to ``"proleptic_gregorian"``.

ACCESS-ESM1-5 files label their (proleptic-Gregorian) time axis with the
non-CF name ``"GREGORIAN"``; ``base.CMORiser._check_calendar`` rewrites this
to ``"proleptic_gregorian"`` on the written file. Date arithmetic for bounds
and resampling must use the same calendar — otherwise cftime treats
``"GREGORIAN"`` as the mixed Julian/Gregorian ``"standard"`` calendar and
shifts pre-1582 dates by ~1 day. All other names are returned unchanged.
"""
return "proleptic_gregorian" if calendar == "GREGORIAN" else calendar


def _shift_resampled_time_to_period_midpoint(
time_da: xr.DataArray, target_freq: pd.Timedelta
) -> xr.DataArray:
"""Move a resampled time coordinate from the period boundary to its midpoint.

pandas/xarray ``resample`` labels each bin on the period boundary (e.g. the
yearly frequency "YE" lands every value on 31 December). The CMOR convention
is to centre the time coordinate on the averaging period — a yearly mean sits
on ~2 July (12:00 in a 365-day year, 00:00 in a 366-day year), the midpoint of
``[Jan 1, next Jan 1]``. This recomputes the coordinate as that midpoint.

Sub-daily or unrecognised frequencies are returned unchanged.
"""
days = target_freq.total_seconds() / 86400
if 360 <= days <= 366:
bounds_fn = _calculate_yearly_bounds
elif 28 <= days <= 31:
bounds_fn = _calculate_monthly_bounds
elif 0.9 <= days <= 1.1:
bounds_fn = _calculate_daily_bounds
else:
return time_da

values = time_da.values
if values.size == 0:
return time_da
is_cftime = isinstance(values.flat[0], cftime.datetime)
calendar = time_da.attrs.get("calendar", "proleptic_gregorian")

bounds = bounds_fn(values, calendar, is_cftime)
midpoints = np.array([lo + (hi - lo) / 2 for lo, hi in bounds], dtype=values.dtype)
return time_da.copy(data=midpoints)


def resample_dataset_temporal(
ds: xr.Dataset,
target_freq: pd.Timedelta,
Expand All @@ -1943,6 +1990,20 @@ def resample_dataset_temporal(
f"Available coordinates: {sorted(ds.coords)}"
)

# xarray's resample requires a monotonic time index. Multi-file inputs supplied
# in non-chronological order (e.g. an unsorted glob) concatenate into a
# non-monotonic time axis, so sort here before resampling.
ds = ds.sortby(time_coord)

# Normalise the non-CF "GREGORIAN" calendar label to "proleptic_gregorian"
# (the calendar the written file ultimately declares) before decoding, so the
# resampled values, the period midpoint and the restored encoding are all
# computed in that calendar rather than cftime's Julian "standard" reading.
if ds[time_coord].attrs.get("calendar") == "GREGORIAN":
ds = ds.assign_coords(
{time_coord: ds[time_coord].assign_attrs(calendar="proleptic_gregorian")}
)

# Convert target frequency to resampling string
freq_str = get_resampling_frequency_string(target_freq)

Expand Down Expand Up @@ -2006,6 +2067,33 @@ def resample_dataset_temporal(
if coord_name != time_coord:
ds_resampled[coord_name] = ds[coord_name]

# Centre the resampled time coordinate on each period's midpoint
# (CMOR convention) instead of the period boundary that resample labels it
# with (e.g. yearly means on ~2 July rather than 31 December).
ds_resampled[time_coord] = _shift_resampled_time_to_period_midpoint(
ds_resampled[time_coord], target_freq
)

# Restore the original CF time encoding. decode_cf() moved units/calendar
# off the coordinate (into encoding) for the resample, so without this the
# written file would have a time axis with no units — CF-invalid. Re-encode
# the (cftime) midpoints back to numeric using the input units/calendar and
# reattach the original attributes, matching the decode_cf=False pipeline.
orig_time_attrs = dict(ds[time_coord].attrs)
orig_units = orig_time_attrs.get("units", "")
if "since" in orig_units:
orig_calendar = orig_time_attrs.get("calendar", "standard")
time_vals = ds_resampled[time_coord].values
# decode_cf yields cftime for non-standard/pre-1582 calendars but
# numpy datetime64 otherwise; date2num needs cftime/datetime objects,
# so convert datetime64 to python datetimes first.
if time_vals.size and not isinstance(time_vals.flat[0], cftime.datetime):
time_vals = pd.to_datetime(time_vals).to_pydatetime()
numeric_time = date2num(time_vals, units=orig_units, calendar=orig_calendar)
ds_resampled[time_coord] = xr.DataArray(
numeric_time, dims=[time_coord], attrs=orig_time_attrs
)

# Update attributes
ds_resampled.attrs = ds.attrs.copy()

Expand Down Expand Up @@ -2165,7 +2253,10 @@ def normalize_cf_time_units(units: Optional[str]) -> Optional[str]:


def calculate_time_bounds(
ds: xr.Dataset, time_coord: str = "time", bnds_name: str = "nv"
ds: xr.Dataset,
time_coord: str = "time",
bnds_name: str = "nv",
freq_hint: Optional[str] = None,
) -> xr.DataArray:
"""
Calculate time bounds from time coordinate for CMIP6 compliance.
Expand All @@ -2186,6 +2277,12 @@ def calculate_time_bounds(
bnds_name : str, default "nv"
Name of the bounds dimension. Use "nv" for ocean data (default),
or "bnds" for atmosphere data
freq_hint : str, optional
Frequency label ("daily", "monthly", "yearly") used only as a fallback
when the frequency cannot be inferred from the time axis itself — i.e.
when there is a single time point (e.g. a multi-year input resampled
down to one year). Inference from ≥2 points always takes precedence, so
this never changes existing multi-point behaviour.

Returns
-------
Expand All @@ -2205,14 +2302,13 @@ def calculate_time_bounds(
time = ds[time_coord]
n_times = len(time)

if n_times < 2:
raise ValueError("Need at least 2 time points to infer time bounds")

# Compute only the 1-D time coordinate. Using .compute().values (rather than
# plain .values) ensures that only the time coordinate's dask graph is
# triggered, not any larger graph that happens to reference the same chunks.
time_values = time.compute().values
calendar = time.attrs.get("calendar", "proleptic_gregorian")
calendar = _normalise_calendar_name(
time.attrs.get("calendar", "proleptic_gregorian")
)
units = time.attrs.get("units")

# Determine the type of time coordinate
Expand All @@ -2236,8 +2332,16 @@ def calculate_time_bounds(
)
is_cftime = True

# Try to infer frequency
freq = _infer_frequency(time_values)
# Infer frequency from the spacing of time points. When only a single time
# point is present (e.g. a multi-year input resampled down to one year),
# inference returns None; fall back to the caller-supplied frequency hint
# (derived from the CMOR table) so per-period bounds can still be built.
freq = _infer_frequency(time_values) or freq_hint
if freq is None:
raise ValueError(
"Need at least 2 time points to infer time bounds, or pass freq_hint "
"(e.g. 'yearly') derived from the target table frequency"
)

# Initialize bounds array
time_bnds = np.empty((n_times, 2), dtype=object if is_cftime else time_values.dtype)
Expand Down
4 changes: 4 additions & 0 deletions src/access_moppy/vocabulary_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,7 @@ def generate_filename(
table_lower = table_name.lower()
is_subdaily_data = any(freq in table_lower for freq in ["3hr", "6hr", "hr"])
is_daily_data = "day" in table_lower
is_yearly_data = "yr" in table_lower

# Format time range based on frequency
if is_subdaily_data:
Expand All @@ -902,6 +903,9 @@ def generate_filename(
elif is_daily_data:
# Daily data: include day (YYYYMMDD)
start, end = [f"{t.year:04d}{t.month:02d}{t.day:02d}" for t in times]
elif is_yearly_data:
# Yearly data (e.g. Oyr): year only (YYYY)
start, end = [f"{t.year:04d}" for t in times]
else:
# Monthly or other data: year and month only (YYYYMM)
start, end = [f"{t.year:04d}{t.month:02d}" for t in times]
Expand Down
23 changes: 23 additions & 0 deletions tests/unit/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import logging
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, patch

import dask.array as da
Expand Down Expand Up @@ -2602,3 +2603,25 @@ def test_above_max_error_includes_actual_maximum(self):
msg = str(exc_info.value)
assert "Actual maximum found" in msg
assert "9" in msg


class TestTargetFrequencyHint:
"""_target_frequency_hint maps the CMOR table frequency to a coarse label,
used only as a single-point fallback in time-bounds construction."""

@pytest.mark.parametrize(
"compound_name, expected",
[
("Oyr.no3", "yearly"),
("Omon.tos", "monthly"),
("Oday.tos", "daily"),
("3hr.x", None), # sub-daily has no coarse bucket
("fx.areacello", None), # time-independent
("Bogus.zzz", None), # unparseable table -> None (exception swallowed)
(None, None), # no compound_name
("", None),
],
)
def test_frequency_hint(self, compound_name, expected):
stub = SimpleNamespace(compound_name=compound_name)
assert CMORiser._target_frequency_hint(stub) == expected
Loading