From 1fabd1eca374ab2518e3cf2826aa3c2a6ed9873b Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 19 Jun 2026 10:57:35 -0600 Subject: [PATCH 1/3] migrate to polars --- README.md | 47 ++++ pyproject.toml | 27 +- src/cytodataframe/__init__.py | 10 + src/cytodataframe/engine.py | 216 ++++++++++++++++ src/cytodataframe/frame.py | 124 ++++++++- src/cytodataframe/lazy.py | 271 ++++++++++++++++++++ src/cytodataframe/schema.py | 456 ++++++++++++++++++++++++++++++++++ tests/test_engine.py | 154 ++++++++++++ tests/test_lazy.py | 164 ++++++++++++ tests/test_schema.py | 199 +++++++++++++++ uv.lock | 251 ++++++------------- 11 files changed, 1739 insertions(+), 180 deletions(-) create mode 100644 src/cytodataframe/engine.py create mode 100644 src/cytodataframe/lazy.py create mode 100644 src/cytodataframe/schema.py create mode 100644 tests/test_engine.py create mode 100644 tests/test_lazy.py create mode 100644 tests/test_schema.py diff --git a/README.md b/README.md index 3c97656..62a1c50 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,39 @@ With CytoDataFrame you can: - Highlight image objects using mask or outline files to understand their segmentation. - Adjust image displays on-the-fly using interactive slider widgets. - Automatically detect 3D image volumes and render interactive [trame](https://github.com/Kitware/trame) views in notebooks when 3D dependencies are installed (with graceful fallback otherwise). +- Interoperate with the [Polars](https://pola.rs/) and [Apache Arrow](https://arrow.apache.org/) ecosystems while keeping the familiar Pandas-based experience. + +### Polars and Arrow interoperability + +CytoDataFrame uses Apache Arrow as its canonical schema/interchange contract and +Polars as an execution engine, while Pandas remains the compatibility layer. You +can move between representations and run lazy, scalable queries without leaving +the CytoDataFrame API: + +```python +import polars as pl +from cytodataframe import CytoDataFrame + +# Construct from pandas, polars (DataFrame or LazyFrame), or a pyarrow Table. +cdf = CytoDataFrame("profiles.parquet") + +# Convert out to any representation (Pandas stays a boundary layer). +cdf.to_pandas() # pandas.DataFrame +cdf.to_polars() # polars.DataFrame +cdf.to_arrow() # pyarrow.Table +cdf.to_lazy() # CytoLazyFrame (lazy, Polars-backed) + +# Inspect the inferred schema (metadata / feature / geometry roles). +cdf.cyto_schema + +# Lazily scan large Parquet datasets with predicate/projection pushdown. +result = ( + CytoDataFrame.scan_parquet("profiles.parquet") + .filter(pl.col("Metadata_Well") == "A01") + .select_features() + .collect() # -> CytoDataFrame +) +``` For 3D notebook display behavior: @@ -53,6 +86,20 @@ pip install cytodataframe pip install git+https://github.com/cytomining/CytoDataFrame.git ``` +The core install is intentionally lean. Heavier, feature-specific stacks are +available as optional extras: + +```shell +# interactive 3D volume rendering (trame / pyvista) +pip install "cytodataframe[viz3d]" + +# OME-Arrow image read/write/embedding (to_ome_parquet, OME-Arrow columns) +pip install "cytodataframe[ome]" + +# everything +pip install "cytodataframe[all]" +``` + ## Contributing, Development, and Testing Please see our [contributing](https://cytomining.github.io/CytoDataFrame/main/contributing) documentation for more details on contributions, development, and testing. diff --git a/pyproject.toml b/pyproject.toml index 25653bb..5186635 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,21 +23,29 @@ dependencies = [ "imagecodecs>=2024.9.22,<2027", "imageio>=2.37,<3", "ipython>=8.12.3,<10", - "ipyvolume>=0.6.3,<0.7", "ipywidgets>=8.1.7,<9", - "matplotlib>=3.9.3,<4", - "nest-asyncio>=1.6,<2", - "ome-arrow>=0.0.3,<0.0.10", "opencv-python>=4.10.0.84,<5", "pandas>=2.2.2,<4", + "polars>=1,<2", "pyarrow>=16", - "pyvista>=0.46.4", - "pywavelets>1.4.1", "scikit-image>0.19.3", +] +# Optional feature stacks. Install with e.g. `pip install cytodataframe[viz3d,ome]`. +# These are imported lazily, so the core package imports fine without them. +optional-dependencies.ome = [ + # OME-Arrow image read/write/embedding (to_ome_parquet, OME-Arrow columns). + "ome-arrow>=0.0.3,<0.0.10", +] +optional-dependencies.viz3d = [ + # Interactive 3D volume rendering (trame/pyvista views in notebooks). + "pyvista>=0.46.4", "trame>=3.12", "trame-vtk>=2.10", "trame-vuetify>=3.1", ] +optional-dependencies.all = [ + "cytodataframe[ome,viz3d]", +] [dependency-groups] dev = [ @@ -46,6 +54,7 @@ dev = [ "coverage>=7.6,<8", "duckdb>=1.1.3,<2", "httpcore>=0.18,<1.1", + "hypothesis>=6,<7", "isort>=5.13.2,<9", "jupyterlab>=4.3,<5", "jupyterlab-code-formatter>=3.0.2,<4", @@ -54,6 +63,12 @@ dev = [ "pytest>=8.3.3,<10", "pytest-cov>=5,<8", "sqlalchemy>=1.3.6,<3", + # optional-feature stacks needed to exercise the full test suite + "ome-arrow>=0.0.3,<0.0.10", + "pyvista>=0.46.4", + "trame>=3.12", + "trame-vtk>=2.10", + "trame-vuetify>=3.1", ] docs = [ "dunamai>=1.22,<2", diff --git a/src/cytodataframe/__init__.py b/src/cytodataframe/__init__.py index f1dc227..33c8379 100644 --- a/src/cytodataframe/__init__.py +++ b/src/cytodataframe/__init__.py @@ -2,7 +2,17 @@ Initialization for cytodataframe package """ +from . import engine from .frame import CytoDataFrame +from .lazy import CytoLazyFrame +from .schema import CytoSchema # note: version placeholder is updated during builds __version__ = "0.0.0" + +__all__ = [ + "CytoDataFrame", + "CytoLazyFrame", + "CytoSchema", + "engine", +] diff --git a/src/cytodataframe/engine.py b/src/cytodataframe/engine.py new file mode 100644 index 0000000..3b20b80 --- /dev/null +++ b/src/cytodataframe/engine.py @@ -0,0 +1,216 @@ +""" +Backend abstraction layer for CytoDataFrame. + +This module is the execution/interchange boundary described in the CytoDataFrame +evolution plan. It treats Apache Arrow as the canonical schema and memory +contract, Polars as the execution engine, and pandas as a compatibility layer. + +The functions here normalize the supported tabular inputs + + * :class:`pandas.DataFrame` / :class:`pandas.Series` + * :class:`polars.DataFrame` + * :class:`polars.LazyFrame` + * :class:`pyarrow.Table` + * :class:`cytodataframe.frame.CytoDataFrame` (a ``pandas.DataFrame`` subclass) + +into the representation requested by the caller while preserving row counts, +null semantics, column ordering, and schema. + +Design notes: + * Arrow is used as the bridge whenever a schema/serialization contract is + requested (``to_arrow``). + * Conversions intentionally avoid forcing existing *pandas* object columns + (which may hold numpy image arrays or OME-Arrow structs) through Arrow, + because Arrow cannot always round-trip arbitrary Python objects. Such + columns are only converted when the caller explicitly asks for an Arrow or + Polars representation. +""" + +from __future__ import annotations + +import pathlib +from typing import TYPE_CHECKING, Any, Union + +import pandas as pd + +if TYPE_CHECKING: # pragma: no cover - typing only + import polars as pl + import pyarrow as pa + +# Public alias describing every tabular input CytoDataFrame's engine understands. +TabularData = Union[ + "pd.DataFrame", + "pd.Series", + "pl.DataFrame", + "pl.LazyFrame", + "pa.Table", +] + + +def _polars() -> Any: + """Import polars lazily so importing this module stays cheap.""" + import polars as pl + + return pl + + +def _pyarrow() -> Any: + """Import pyarrow lazily so importing this module stays cheap.""" + import pyarrow as pa + + return pa + + +def is_polars_dataframe(data: Any) -> bool: + """Return True when ``data`` is a :class:`polars.DataFrame`.""" + try: + pl = _polars() + except ImportError: + return False + return isinstance(data, pl.DataFrame) + + +def is_polars_lazyframe(data: Any) -> bool: + """Return True when ``data`` is a :class:`polars.LazyFrame`.""" + try: + pl = _polars() + except ImportError: + return False + return isinstance(data, pl.LazyFrame) + + +def is_arrow_table(data: Any) -> bool: + """Return True when ``data`` is a :class:`pyarrow.Table`.""" + try: + pa = _pyarrow() + except ImportError: + return False + return isinstance(data, pa.Table) + + +def is_supported(data: Any) -> bool: + """Return True when ``data`` is one of the supported tabular inputs.""" + return ( + isinstance(data, (pd.DataFrame, pd.Series)) + or is_polars_dataframe(data) + or is_polars_lazyframe(data) + or is_arrow_table(data) + ) + + +def to_pandas(data: TabularData) -> pd.DataFrame: + """ + Convert any supported tabular input to a :class:`pandas.DataFrame`. + + pandas inputs (including ``CytoDataFrame``) are returned as-is so that object + columns holding images or OME-Arrow structs are never disturbed. + """ + if isinstance(data, pd.DataFrame): + return data + if isinstance(data, pd.Series): + return data.to_frame() + if is_polars_lazyframe(data): + return data.collect().to_pandas() + if is_polars_dataframe(data): + return data.to_pandas() + if is_arrow_table(data): + return data.to_pandas() + raise TypeError( + f"Unsupported type for CytoDataFrame engine conversion: {type(data)!r}" + ) + + +def to_polars(data: TabularData) -> "pl.DataFrame": + """Convert any supported tabular input to an eager :class:`polars.DataFrame`.""" + pl = _polars() + if isinstance(data, pl.DataFrame): + return data + if isinstance(data, pl.LazyFrame): + return data.collect() + if is_arrow_table(data): + return pl.from_arrow(data) + if isinstance(data, pd.Series): + data = data.to_frame() + if isinstance(data, pd.DataFrame): + # Strip any pandas subclass (e.g. CytoDataFrame) and index before handing + # the frame to polars, which has no index concept. + try: + return pl.from_pandas(pd.DataFrame(data)) + except Exception as exc: + raise TypeError( + "Could not convert pandas data to polars. Columns holding " + "non-Arrow-compatible Python objects (e.g. numpy image arrays) " + "cannot be represented in polars/Arrow." + ) from exc + raise TypeError( + f"Unsupported type for CytoDataFrame engine conversion: {type(data)!r}" + ) + + +def to_lazyframe(data: TabularData) -> "pl.LazyFrame": + """Convert any supported tabular input to a :class:`polars.LazyFrame`.""" + pl = _polars() + if isinstance(data, pl.LazyFrame): + return data + return to_polars(data).lazy() + + +def to_arrow(data: TabularData, *, preserve_index: bool = False) -> "pa.Table": + """ + Convert any supported tabular input to a :class:`pyarrow.Table`. + + Arrow is the canonical schema/serialization contract, so this is the + conversion used whenever schema or interchange guarantees matter. + """ + pa = _pyarrow() + if is_arrow_table(data): + return data + if is_polars_lazyframe(data): + return data.collect().to_arrow() + if is_polars_dataframe(data): + return data.to_arrow() + if isinstance(data, pd.Series): + data = data.to_frame() + if isinstance(data, pd.DataFrame): + try: + return pa.Table.from_pandas( + pd.DataFrame(data), preserve_index=preserve_index + ) + except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as exc: + raise TypeError( + "Could not convert pandas data to an Arrow table. Columns " + "holding non-Arrow-compatible Python objects (e.g. numpy image " + "arrays) cannot be represented in Arrow." + ) from exc + raise TypeError( + f"Unsupported type for CytoDataFrame engine conversion: {type(data)!r}" + ) + + +def normalize_to_pandas(data: TabularData) -> pd.DataFrame: + """ + Normalize a supported input to pandas for the compatibility facade. + + This is the ingestion entry point used by ``CytoDataFrame.__init__`` to wrap + Polars/Arrow inputs while keeping pandas as the backing store. + """ + return to_pandas(data) + + +def scan_parquet( + source: Union[str, pathlib.Path], **kwargs: Any +) -> "pl.LazyFrame": + """ + Lazily scan a Parquet file/dataset into a :class:`polars.LazyFrame`. + + This enables predicate/projection pushdown for large profiling datasets + without materializing them eagerly. + """ + pl = _polars() + return pl.scan_parquet(source, **kwargs) + + +def read_parquet(source: Union[str, pathlib.Path], **kwargs: Any) -> "pl.DataFrame": + """Eagerly read a Parquet file into a :class:`polars.DataFrame`.""" + pl = _polars() + return pl.read_parquet(source, **kwargs) diff --git a/src/cytodataframe/frame.py b/src/cytodataframe/frame.py index 3cad707..790b44b 100644 --- a/src/cytodataframe/frame.py +++ b/src/cytodataframe/frame.py @@ -42,6 +42,7 @@ ) from skimage.util import img_as_ubyte +from . import engine from .image import ( add_image_scale_bar, adjust_with_adaptive_histogram_equalization, @@ -49,6 +50,9 @@ draw_outline_on_image_from_outline, get_pixel_bbox_from_offsets, ) +from .lazy import CytoLazyFrame, build_context +from .lazy import scan_parquet as _lazy_scan_parquet +from .schema import CytoSchema from .volume import ( build_3d_html_from_path, build_3d_image_html_stub, @@ -112,7 +116,7 @@ class CytoDataFrame(pd.DataFrame): # while avoiding oversized outputs in typical Jupyter viewports. _DEFAULT_TABLE_MAX_HEIGHT: ClassVar[str] = "700px" - def __init__( # noqa: PLR0913 + def __init__( # noqa: PLR0913, C901, PLR0912 self: CytoDataFrame_type, data: Union[CytoDataFrame_type, pd.DataFrame, str, pathlib.Path], data_context_dir: Optional[str] = None, @@ -326,6 +330,19 @@ def __init__( # noqa: PLR0913 super().__init__(data) + # polars/arrow inputs are probed last so the common pandas/path cases + # never trigger a polars/pyarrow import. + elif engine.is_polars_lazyframe(data): + # Lazy polars input: collect through the Arrow contract into the + # pandas compatibility facade. + self._custom_attrs["data_source"] = "polars.LazyFrame" + super().__init__(engine.normalize_to_pandas(data)) + elif engine.is_polars_dataframe(data): + self._custom_attrs["data_source"] = "polars.DataFrame" + super().__init__(engine.normalize_to_pandas(data)) + elif engine.is_arrow_table(data): + self._custom_attrs["data_source"] = "pyarrow.Table" + super().__init__(engine.normalize_to_pandas(data)) else: super().__init__(data) @@ -1506,6 +1523,111 @@ def export( else: raise ValueError("Unsupported file format for export.") + # ------------------------------------------------------------------ # + # Backend / interchange conversions (Polars engine, Arrow contract) + # ------------------------------------------------------------------ # + def to_pandas(self: CytoDataFrame_type) -> pd.DataFrame: + """ + Return the data as a plain :class:`pandas.DataFrame`. + + The pandas layer is CytoDataFrame's compatibility boundary; this + returns a standard pandas DataFrame (not a CytoDataFrame) for use with + pandas-native tooling. + """ + return pd.DataFrame(self) + + def to_polars(self: CytoDataFrame_type) -> Any: + """ + Return the tabular data as an eager :class:`polars.DataFrame`. + + Note: polars has no row-index concept, so the pandas index is dropped. + Object columns holding non-Arrow values (e.g. numpy image arrays) cannot + be converted and will raise a ``TypeError``. + """ + return engine.to_polars(pd.DataFrame(self)) + + def to_lazy(self: CytoDataFrame_type) -> CytoLazyFrame: + """ + Return a lazy, Polars-backed :class:`CytoLazyFrame` view. + + The returned object carries this frame's image/display context so that a + subsequent ``.collect()`` rebuilds an equivalently-configured + CytoDataFrame. + """ + return CytoLazyFrame( + engine.to_lazyframe(pd.DataFrame(self)), + context=build_context(self._custom_attrs), + ) + + def to_arrow(self: CytoDataFrame_type, preserve_index: bool = False) -> Any: + """ + Return the tabular data as a :class:`pyarrow.Table`. + + Arrow is CytoDataFrame's canonical schema and interchange contract. + """ + return engine.to_arrow(pd.DataFrame(self), preserve_index=preserve_index) + + @property + def cyto_schema(self: CytoDataFrame_type) -> CytoSchema: + """ + The inferred :class:`CytoSchema` describing this frame's columns. + + Classifies columns into image/object keys, metadata, feature, and + geometry roles using the Arrow-native schema contract. + """ + return CytoSchema.from_pandas(pd.DataFrame(self)) + + @classmethod + def from_file( + cls, + source: Union[str, pathlib.Path], + **kwargs: Any, + ) -> "CytoDataFrame": + """ + Eagerly construct a CytoDataFrame from a file path. + + A thin, explicit alias for ``CytoDataFrame(source, ...)`` matching the + domain-oriented API in the evolution plan. + """ + return cls(source, **kwargs) + + @classmethod + def scan_parquet( # noqa: PLR0913 + cls, + source: Union[str, pathlib.Path], + data_context_dir: Optional[str] = None, + data_mask_context_dir: Optional[str] = None, + data_outline_context_dir: Optional[str] = None, + segmentation_file_regex: Optional[Dict[str, str]] = None, + image_adjustment: Optional[Callable] = None, + display_options: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> CytoLazyFrame: + """ + Lazily scan a Parquet source into a :class:`CytoLazyFrame`. + + Enables predicate/projection pushdown for large profiling datasets:: + + ( + CytoDataFrame.scan_parquet("profiles.parquet") + .filter(...) + .select_features() + .collect() + ) + + The image/display context provided here is carried through the lazy + pipeline and applied when the result is ``.collect()``-ed. + """ + context = { + "data_context_dir": data_context_dir, + "data_mask_context_dir": data_mask_context_dir, + "data_outline_context_dir": data_outline_context_dir, + "segmentation_file_regex": segmentation_file_regex, + "image_adjustment": image_adjustment, + "display_options": display_options, + } + return _lazy_scan_parquet(source, context=context, **kwargs) + def to_ome_parquet( # noqa: PLR0915, PLR0912, C901 self: CytoDataFrame_type, file_path: Union[str, pathlib.Path], diff --git a/src/cytodataframe/lazy.py b/src/cytodataframe/lazy.py new file mode 100644 index 0000000..25b40bb --- /dev/null +++ b/src/cytodataframe/lazy.py @@ -0,0 +1,271 @@ +""" +Lazy Polars query builder for CytoDataFrame. + +``CytoLazyFrame`` wraps a :class:`polars.LazyFrame` and carries the +CytoDataFrame "context" (image directories, display options, ...) so that a +lazy pipeline can be materialized back into a fully-configured +:class:`~cytodataframe.frame.CytoDataFrame`. + +This is the surface that powers the lazy-execution example from the evolution +plan:: + + ( + CytoDataFrame.scan_parquet("profiles.parquet") + .filter(pl.col("Metadata_Well") == "A01") + .select_features() + .collect() + ) + +It is intentionally a *separate* type from ``CytoDataFrame`` so that its +polars-native ``filter``/``select`` semantics never collide with pandas' own +``DataFrame.filter`` (which CytoDataFrame inherits and relies on internally). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence + +from . import engine +from .schema import CytoSchema + +if TYPE_CHECKING: # pragma: no cover - typing only + import pandas as pd + import polars as pl + import pyarrow as pa + + from .frame import CytoDataFrame + + +# Constructor kwargs that carry image/display context and should survive a lazy +# pipeline so ``collect()`` rebuilds an equivalently-configured CytoDataFrame. +_CONTEXT_KEYS = ( + "data_context_dir", + "data_mask_context_dir", + "data_outline_context_dir", + "segmentation_file_regex", + "image_adjustment", + "display_options", +) + +# Number of column names shown in a CytoLazyFrame repr before truncating. +_REPR_PREVIEW_COLS = 8 + + +class CytoLazyGroupBy: + """Thin wrapper around a polars lazy group-by that returns a CytoLazyFrame.""" + + def __init__(self, group_by: Any, context: Dict[str, Any]) -> None: + self._group_by = group_by + self._context = context + + def agg(self, *aggs: Any, **named_aggs: Any) -> "CytoLazyFrame": + """Aggregate grouped data, returning a :class:`CytoLazyFrame`.""" + return CytoLazyFrame( + self._group_by.agg(*aggs, **named_aggs), context=self._context + ) + + +class CytoLazyFrame: + """ + A lazy, Polars-backed view over CytoDataFrame data. + + The wrapped :class:`polars.LazyFrame` is the canonical execution engine; + operations build up a query plan and only execute on :meth:`collect`. + """ + + def __init__( + self, + data: Any, + *, + context: Optional[Dict[str, Any]] = None, + ) -> None: + self._lf: "pl.LazyFrame" = engine.to_lazyframe(data) + self._context: Dict[str, Any] = dict(context or {}) + + # ------------------------------------------------------------------ # + # Introspection + # ------------------------------------------------------------------ # + @property + def lazyframe(self) -> "pl.LazyFrame": + """The underlying :class:`polars.LazyFrame`.""" + return self._lf + + @property + def context(self) -> Dict[str, Any]: + """The CytoDataFrame context carried through the pipeline.""" + return dict(self._context) + + @property + def columns(self) -> List[str]: + """Column names of the (lazily) resolved schema.""" + return list(self._lf.collect_schema().names()) + + @property + def cyto_schema(self) -> CytoSchema: + """Infer a :class:`CytoSchema` from the lazy schema (no data scan).""" + return CytoSchema.from_polars(self._lf) + + def _wrap(self, lazyframe: "pl.LazyFrame") -> "CytoLazyFrame": + """Wrap a derived LazyFrame, preserving context.""" + return CytoLazyFrame(lazyframe, context=self._context) + + # ------------------------------------------------------------------ # + # Table operations (delegated to polars, return CytoLazyFrame) + # ------------------------------------------------------------------ # + def filter(self, *predicates: Any, **constraints: Any) -> "CytoLazyFrame": + """Filter rows. Mirrors :meth:`polars.LazyFrame.filter`.""" + return self._wrap(self._lf.filter(*predicates, **constraints)) + + def select(self, *exprs: Any, **named_exprs: Any) -> "CytoLazyFrame": + """Select/transform columns. Mirrors :meth:`polars.LazyFrame.select`.""" + return self._wrap(self._lf.select(*exprs, **named_exprs)) + + def with_columns(self, *exprs: Any, **named_exprs: Any) -> "CytoLazyFrame": + """Add/replace columns. Mirrors :meth:`polars.LazyFrame.with_columns`.""" + return self._wrap(self._lf.with_columns(*exprs, **named_exprs)) + + def rename(self, mapping: Dict[str, str], **kwargs: Any) -> "CytoLazyFrame": + """Rename columns. Mirrors :meth:`polars.LazyFrame.rename`.""" + return self._wrap(self._lf.rename(mapping, **kwargs)) + + def drop(self, *columns: Any, **kwargs: Any) -> "CytoLazyFrame": + """Drop columns. Mirrors :meth:`polars.LazyFrame.drop`.""" + return self._wrap(self._lf.drop(*columns, **kwargs)) + + def sort(self, *by: Any, **kwargs: Any) -> "CytoLazyFrame": + """Sort rows. Mirrors :meth:`polars.LazyFrame.sort`.""" + return self._wrap(self._lf.sort(*by, **kwargs)) + + def unique(self, *args: Any, **kwargs: Any) -> "CytoLazyFrame": + """Drop duplicate rows. Mirrors :meth:`polars.LazyFrame.unique`.""" + return self._wrap(self._lf.unique(*args, **kwargs)) + + def head(self, n: int = 5) -> "CytoLazyFrame": + """Return the first ``n`` rows lazily.""" + return self._wrap(self._lf.head(n)) + + def tail(self, n: int = 5) -> "CytoLazyFrame": + """Return the last ``n`` rows lazily.""" + return self._wrap(self._lf.tail(n)) + + def limit(self, n: int = 5) -> "CytoLazyFrame": + """Limit to ``n`` rows lazily.""" + return self._wrap(self._lf.limit(n)) + + def join( + self, + other: "CytoLazyFrame | pl.LazyFrame | pl.DataFrame | pd.DataFrame", + *args: Any, + **kwargs: Any, + ) -> "CytoLazyFrame": + """ + Join against another frame. Mirrors :meth:`polars.LazyFrame.join`. + + ``other`` may be a CytoLazyFrame, polars LazyFrame/DataFrame, or pandas + DataFrame; it is normalized to a LazyFrame first. + """ + if isinstance(other, CytoLazyFrame): + other_lf = other._lf + else: + other_lf = engine.to_lazyframe(other) + return self._wrap(self._lf.join(other_lf, *args, **kwargs)) + + def group_by(self, *by: Any, **kwargs: Any) -> CytoLazyGroupBy: + """Group rows for aggregation. Mirrors :meth:`polars.LazyFrame.group_by`.""" + return CytoLazyGroupBy(self._lf.group_by(*by, **kwargs), self._context) + + def select_features( + self, + features: Optional[Iterable[str]] = None, + *, + keep_metadata: bool = True, + ) -> "CytoLazyFrame": + """ + Select feature columns (optionally keeping metadata identifiers). + + When ``features`` is omitted, the schema-inferred feature columns are + used. When ``keep_metadata`` is True, metadata/identifier/image columns + are retained alongside the selected features, preserving original column + order. + """ + import polars as pl + + schema = self.cyto_schema + available = self.columns + available_set = set(available) + + if features is None: + feature_set = set(schema.feature_columns) + else: + feature_set = {str(f) for f in features} + + keep = set(feature_set) + if keep_metadata: + keep.update(schema.metadata_columns) + + selected = [col for col in available if col in keep and col in available_set] + return self._wrap(self._lf.select([pl.col(c) for c in selected])) + + # ------------------------------------------------------------------ # + # Materialization + # ------------------------------------------------------------------ # + def collect_polars(self, **kwargs: Any) -> "pl.DataFrame": + """Execute the query plan, returning an eager :class:`polars.DataFrame`.""" + return self._lf.collect(**kwargs) + + def to_polars(self, **kwargs: Any) -> "pl.DataFrame": + """Alias for :meth:`collect_polars`.""" + return self.collect_polars(**kwargs) + + def to_arrow(self, **kwargs: Any) -> "pa.Table": + """Execute and return a :class:`pyarrow.Table`.""" + return self.collect_polars(**kwargs).to_arrow() + + def to_pandas(self, **kwargs: Any) -> "pd.DataFrame": + """Execute and return a :class:`pandas.DataFrame`.""" + return self.collect_polars(**kwargs).to_pandas() + + def collect(self, **kwargs: Any) -> "CytoDataFrame": + """ + Execute the query plan and return a configured ``CytoDataFrame``. + + The CytoDataFrame is rebuilt with the image/display context that was + carried through the lazy pipeline. + """ + # Imported lazily to avoid a circular import at module load time. + from .frame import CytoDataFrame + + pandas_df = self.collect_polars(**kwargs).to_pandas() + context = {k: v for k, v in self._context.items() if k in _CONTEXT_KEYS} + return CytoDataFrame(pandas_df, **context) + + def __repr__(self) -> str: + try: + cols = self.columns + head = cols[:_REPR_PREVIEW_COLS] + preview = ", ".join(head) + ( + " ..." if len(cols) > _REPR_PREVIEW_COLS else "" + ) + except Exception: # repr must never raise + preview = "" + return f"CytoLazyFrame(columns=[{preview}])" + + +def build_context(custom_attrs: Dict[str, Any]) -> Dict[str, Any]: + """Extract the carry-through context from a CytoDataFrame ``_custom_attrs``.""" + return {key: custom_attrs.get(key) for key in _CONTEXT_KEYS} + + +def scan_parquet( + source: Any, + *, + context: Optional[Dict[str, Any]] = None, + **kwargs: Any, +) -> CytoLazyFrame: + """Lazily scan a Parquet source into a :class:`CytoLazyFrame`.""" + return CytoLazyFrame(engine.scan_parquet(source, **kwargs), context=context) + + +def from_sequence_context(keys: Sequence[str], values: Sequence[Any]) -> Dict[str, Any]: + """Build a context dict from parallel key/value sequences (helper for tests).""" + return dict(zip(keys, values, strict=False)) diff --git a/src/cytodataframe/schema.py b/src/cytodataframe/schema.py new file mode 100644 index 0000000..5ec5982 --- /dev/null +++ b/src/cytodataframe/schema.py @@ -0,0 +1,456 @@ +""" +Formal schema system for CytoDataFrame. + +This module implements the Arrow-native schema contract described in the +CytoDataFrame evolution plan (Phases 2 and 3). It provides: + + * :class:`CytoSchema` - an explicit, inspectable classification of a + profiling table's columns into image / object keys, metadata, feature, and + geometry roles. The classification reduces reliance on ad-hoc naming + conventions scattered through the codebase and gives downstream operations + a single source of truth. + * Arrow-native struct helpers that fold the flattened CellProfiler-style + bounding-box / centroid columns into nested Arrow structs while keeping the + flattened compatibility columns available for existing consumers. + +Schema inference is deterministic and works from a pandas DataFrame, a polars +DataFrame/LazyFrame, or a :class:`pyarrow.Schema`/:class:`pyarrow.Table`. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Sequence + +if TYPE_CHECKING: # pragma: no cover - typing only + import pandas as pd + import polars as pl + import pyarrow as pa + + +# --------------------------------------------------------------------------- # +# Column-role detection patterns +# --------------------------------------------------------------------------- # + +# Geometry columns hold spatial coordinates (bounding boxes / centroids). +_GEOMETRY_PATTERN = re.compile( + r"(boundingbox" + r"|location_center" + r"|areashape_center" + r"|center_mass" + r"|_center_[xyz]\b" + r"|_center_[xyz]$)", + flags=re.IGNORECASE, +) + +# Image filename / path columns reference images rather than measurements. +_IMAGE_FILENAME_PATTERN = re.compile(r"filename", flags=re.IGNORECASE) +_IMAGE_PATHNAME_PATTERN = re.compile(r"pathname", flags=re.IGNORECASE) + +# Known single-cell object identifier columns, in preference order. +_OBJECT_KEY_PRIORITY = ( + "metadata_objectnumber", + "metadata_object_number", + "objectnumber", + "object_number", +) +_OBJECT_KEY_SUFFIX = "number_object_number" + +# Known identifier-style metadata columns (casefolded exact names). +_KNOWN_ID_COLUMNS = frozenset( + { + "imagenumber", + "objectnumber", + "object_number", + "tablenumber", + "table_number", + "plate", + "well", + "site", + } +) + + +def _is_image_column(name: str) -> bool: + """Return True when a column name references an image filename or path.""" + return bool( + _IMAGE_FILENAME_PATTERN.search(name) + or _IMAGE_PATHNAME_PATTERN.search(name) + ) + + +def _is_geometry_column(name: str) -> bool: + """Return True when a column name encodes spatial geometry.""" + return bool(_GEOMETRY_PATTERN.search(name)) + + +def _is_identifier_metadata(name: str) -> bool: + """Return True when a column name looks like an identifier/metadata column.""" + lowered = name.casefold() + if lowered.startswith("metadata"): + return True + if lowered in _KNOWN_ID_COLUMNS: + return True + return lowered.endswith(_OBJECT_KEY_SUFFIX) + + +@dataclass +class CytoSchema: + """ + An explicit classification of a profiling table's columns. + + Attributes: + image_key: + The primary image filename column, if present. + object_key: + The single-cell object identifier column, if present. + metadata_columns: + Identifier / annotation / image-reference / non-numeric columns. + feature_columns: + Numeric measurement columns (the modeling features). + geometry_columns: + Spatial coordinate columns (bounding boxes, centroids). + image_columns: + All image filename/path columns (``image_key`` is the first). + """ + + image_key: Optional[str] = None + object_key: Optional[str] = None + metadata_columns: List[str] = field(default_factory=list) + feature_columns: List[str] = field(default_factory=list) + geometry_columns: List[str] = field(default_factory=list) + image_columns: List[str] = field(default_factory=list) + + # ------------------------------------------------------------------ # + # Construction / inference + # ------------------------------------------------------------------ # + @classmethod + def from_columns( + cls, + columns: Sequence[str], + numeric: Optional[Mapping[str, bool]] = None, + ) -> "CytoSchema": + """ + Classify ``columns`` into schema roles. + + Args: + columns: + Ordered column names. + numeric: + Optional mapping of column name -> whether the column holds a + numeric dtype. When a column is absent from the mapping (or the + mapping is ``None``) the column is treated as numeric for the + purpose of feature detection, so name-based rules still apply. + """ + numeric = dict(numeric) if numeric is not None else None + + metadata: List[str] = [] + features: List[str] = [] + geometry: List[str] = [] + image_columns: List[str] = [] + + for name in columns: + col = str(name) + is_numeric = True if numeric is None else bool(numeric.get(col, True)) + + if _is_image_column(col): + image_columns.append(col) + metadata.append(col) + continue + if _is_geometry_column(col): + geometry.append(col) + continue + if _is_identifier_metadata(col) or not is_numeric: + metadata.append(col) + continue + features.append(col) + + image_key = image_columns[0] if image_columns else None + object_key = cls._detect_object_key(columns) + + return cls( + image_key=image_key, + object_key=object_key, + metadata_columns=metadata, + feature_columns=features, + geometry_columns=geometry, + image_columns=image_columns, + ) + + @staticmethod + def _detect_object_key(columns: Sequence[str]) -> Optional[str]: + """Return the best single-cell object identifier column, if any.""" + lowered = {str(c).casefold(): str(c) for c in columns} + for candidate in _OBJECT_KEY_PRIORITY: + if candidate in lowered: + return lowered[candidate] + for col in columns: + if str(col).casefold().endswith(_OBJECT_KEY_SUFFIX): + return str(col) + return None + + @classmethod + def from_pandas(cls, data: "pd.DataFrame") -> "CytoSchema": + """Infer a schema from a :class:`pandas.DataFrame`.""" + import pandas as pd + + numeric = { + str(col): ( + pd.api.types.is_numeric_dtype(dtype) + and not pd.api.types.is_bool_dtype(dtype) + ) + for col, dtype in data.dtypes.items() + } + return cls.from_columns(list(data.columns), numeric=numeric) + + @classmethod + def from_arrow(cls, schema: "pa.Schema") -> "CytoSchema": + """Infer a schema from a :class:`pyarrow.Schema`.""" + import pyarrow as pa + + def _numeric(dtype: "pa.DataType") -> bool: + return ( + pa.types.is_integer(dtype) + or pa.types.is_floating(dtype) + or pa.types.is_decimal(dtype) + ) + + numeric = {field.name: _numeric(field.type) for field in schema} + return cls.from_columns(list(schema.names), numeric=numeric) + + @classmethod + def from_polars(cls, data: "pl.DataFrame | pl.LazyFrame") -> "CytoSchema": + """Infer a schema from a polars DataFrame or LazyFrame.""" + schema = data.collect_schema() if hasattr(data, "collect_schema") else None + if schema is None: + schema = data.schema + numeric = {name: dtype.is_numeric() for name, dtype in schema.items()} + return cls.from_columns(list(schema.keys()), numeric=numeric) + + @classmethod + def infer(cls, data: Any) -> "CytoSchema": + """ + Infer a schema from any supported tabular input. + + Dispatches on the runtime type so callers can pass pandas, polars, or + Arrow data without converting first. + """ + import pandas as pd + + # pyarrow first: a Table exposes ``.schema``. + try: + import pyarrow as pa + + if isinstance(data, pa.Table): + return cls.from_arrow(data.schema) + if isinstance(data, pa.Schema): + return cls.from_arrow(data) + except ImportError: # pragma: no cover - pyarrow is a hard dependency + pass + + try: + import polars as pl + + if isinstance(data, (pl.DataFrame, pl.LazyFrame)): + return cls.from_polars(data) + except ImportError: # pragma: no cover - polars is a hard dependency + pass + + if isinstance(data, pd.DataFrame): + return cls.from_pandas(data) + + raise TypeError( + f"Cannot infer a CytoSchema from object of type {type(data)!r}." + ) + + # ------------------------------------------------------------------ # + # Introspection / validation + # ------------------------------------------------------------------ # + @property + def columns(self) -> List[str]: + """All classified columns in metadata/geometry/feature order.""" + ordered: List[str] = [] + seen: set[str] = set() + for bucket in ( + self.metadata_columns, + self.geometry_columns, + self.feature_columns, + ): + for col in bucket: + if col not in seen: + seen.add(col) + ordered.append(col) + return ordered + + def validate(self, strict: bool = False) -> List[str]: + """ + Check schema self-consistency. + + Returns a list of human-readable issues. When ``strict`` is True and any + issue is found, a :class:`ValueError` is raised instead. + """ + issues: List[str] = [] + + feature_set = set(self.feature_columns) + metadata_set = set(self.metadata_columns) + geometry_set = set(self.geometry_columns) + + overlap_fm = feature_set & metadata_set + overlap_fg = feature_set & geometry_set + if overlap_fm: + issues.append( + f"Columns classified as both feature and metadata: " + f"{sorted(overlap_fm)}" + ) + if overlap_fg: + issues.append( + f"Columns classified as both feature and geometry: " + f"{sorted(overlap_fg)}" + ) + if self.image_key is not None and self.image_key not in metadata_set: + issues.append( + f"image_key {self.image_key!r} is not present in metadata columns." + ) + + if strict and issues: + raise ValueError("Invalid CytoSchema: " + "; ".join(issues)) + return issues + + def require(self, *keys: str) -> "CytoSchema": + """ + Assert that the named required keys are present. + + Args: + keys: + Any of ``"image_key"`` / ``"object_key"``. Raises + :class:`ValueError` when a required key is ``None``. + """ + missing = [key for key in keys if getattr(self, key, None) is None] + if missing: + raise ValueError( + f"CytoSchema is missing required key(s): {missing}" + ) + return self + + def to_dict(self) -> dict: + """Return a plain-dict view of the schema (handy for tests/serialization).""" + return { + "image_key": self.image_key, + "object_key": self.object_key, + "metadata_columns": list(self.metadata_columns), + "feature_columns": list(self.feature_columns), + "geometry_columns": list(self.geometry_columns), + "image_columns": list(self.image_columns), + } + + +# --------------------------------------------------------------------------- # +# Arrow-native struct helpers (Phase 3) +# --------------------------------------------------------------------------- # + +# Bounding-box column groups keyed by compartment, mirroring the flattened +# CellProfiler naming convention. Order is (min_x, min_y, max_x, max_y). +_BBOX_GROUPS = { + "cytoplasm": "Cytoplasm_AreaShape_BoundingBox", + "nuclei": "Nuclei_AreaShape_BoundingBox", + "cells": "Cells_AreaShape_BoundingBox", + "generic": "AreaShape_BoundingBox", +} + +# Centroid column groups keyed by compartment, mirroring the flattened naming. +_CENTROID_GROUPS = { + "nuclei": "Nuclei_Location_Center", + "nuclei_meta": "Metadata_Nuclei_Location_Center", + "cells": "Cells_Location_Center", + "cells_meta": "Metadata_Cells_Location_Center", + "cytoplasm": "Cytoplasm_Location_Center", + "cytoplasm_meta": "Metadata_Cytoplasm_Location_Center", +} + + +def _bbox_field_columns(prefix: str) -> dict: + """Return the flattened bounding-box column names for a prefix.""" + return { + "min_x": f"{prefix}Minimum_X", + "min_y": f"{prefix}Minimum_Y", + "max_x": f"{prefix}Maximum_X", + "max_y": f"{prefix}Maximum_Y", + "min_z": f"{prefix}Minimum_Z", + "max_z": f"{prefix}Maximum_Z", + } + + +def add_bbox_struct( + data: "pl.DataFrame", + struct_name: str = "bbox", + keep_flattened: bool = True, +) -> "pl.DataFrame": + """ + Fold flattened bounding-box columns into a nested Arrow struct. + + The first matching compartment group (cytoplasm -> nuclei -> cells -> + generic) is used. The flattened compatibility columns are retained by + default so existing consumers keep working. + + Returns the input unchanged when no bounding-box columns are present. + """ + import polars as pl + + required_keys = ("min_x", "min_y", "max_x", "max_y") + available = set(data.columns) + for prefix in _BBOX_GROUPS.values(): + cols = _bbox_field_columns(prefix) + required = {k: v for k, v in cols.items() if k in required_keys} + if not all(col in available for col in required.values()): + continue + fields = [ + pl.col(cols["min_x"]).alias("min_x"), + pl.col(cols["min_y"]).alias("min_y"), + pl.col(cols["max_x"]).alias("max_x"), + pl.col(cols["max_y"]).alias("max_y"), + ] + if cols["min_z"] in available and cols["max_z"] in available: + fields.append(pl.col(cols["min_z"]).alias("min_z")) + fields.append(pl.col(cols["max_z"]).alias("max_z")) + result = data.with_columns(pl.struct(fields).alias(struct_name)) + if not keep_flattened: + drop = [c for c in cols.values() if c in available] + result = result.drop(drop) + return result + return data + + +def add_centroid_struct( + data: "pl.DataFrame", + struct_name: str = "centroid", + keep_flattened: bool = True, +) -> "pl.DataFrame": + """ + Fold flattened centroid columns into a nested Arrow struct ``{x, y[, z]}``. + + The first matching compartment group is used. Flattened compatibility + columns are retained by default. Returns the input unchanged when no + centroid columns are present. + """ + import polars as pl + + available = set(data.columns) + for prefix in _CENTROID_GROUPS.values(): + x_col = f"{prefix}_X" + y_col = f"{prefix}_Y" + z_col = f"{prefix}_Z" + if x_col not in available or y_col not in available: + continue + fields = [ + pl.col(x_col).alias("x"), + pl.col(y_col).alias("y"), + ] + if z_col in available: + fields.append(pl.col(z_col).alias("z")) + result = data.with_columns(pl.struct(fields).alias(struct_name)) + if not keep_flattened: + drop = [c for c in (x_col, y_col, z_col) if c in available] + result = result.drop(drop) + return result + return data diff --git a/tests/test_engine.py b/tests/test_engine.py new file mode 100644 index 0000000..3ee82d6 --- /dev/null +++ b/tests/test_engine.py @@ -0,0 +1,154 @@ +""" +Tests for the CytoDataFrame backend abstraction layer (engine.py). + +Covers Arrow round-trip / interchange guarantees described in the evolution +plan: row counts, nulls, schema, and column ordering must be preserved across + + cdf -> Arrow -> cdf + cdf -> Parquet -> cdf + cdf -> pandas -> cdf + cdf -> Polars -> cdf +""" + +import pathlib + +import numpy as np +import pandas as pd +import polars as pl +import pyarrow as pa +import pytest + +from cytodataframe import CytoDataFrame, engine + + +@pytest.fixture(name="profiling_frame") +def fixture_profiling_frame() -> pd.DataFrame: + """A small profiling-like frame with mixed dtypes and nulls.""" + return pd.DataFrame( + { + "Metadata_Well": ["A01", "A01", "B02", None], + "Metadata_ObjectNumber": [1, 2, 1, 2], + "Cells_AreaShape_Area": [10.0, np.nan, 30.0, 40.0], + "Nuclei_Location_Center_X": [5.0, 6.0, 7.0, 8.0], + "Nuclei_Intensity_MeanIntensity_DNA": [0.1, 0.2, 0.3, 0.4], + } + ) + + +def _assert_tabular_equivalent(left: pd.DataFrame, right: pd.DataFrame) -> None: + """Assert two frames share row count, columns, null mask, and values.""" + left = left.reset_index(drop=True) + right = right.reset_index(drop=True) + assert len(left) == len(right) + assert list(left.columns) == list(right.columns) + for col in left.columns: + lnull = left[col].isna().to_numpy() + rnull = right[col].isna().to_numpy() + assert np.array_equal(lnull, rnull), f"null mask differs for {col}" + lvals = left[col][~left[col].isna()].tolist() + rvals = right[col][~right[col].isna()].tolist() + assert lvals == rvals, f"values differ for {col}" + + +# --------------------------------------------------------------------------- # +# Conversions from every supported input type +# --------------------------------------------------------------------------- # +def test_engine_to_arrow_from_all_inputs(profiling_frame: pd.DataFrame): + pdf = profiling_frame + expected = pa.Table.from_pandas(pdf, preserve_index=False) + for source in ( + pdf, + pl.from_pandas(pdf), + pl.from_pandas(pdf).lazy(), + expected, + ): + table = engine.to_arrow(source) + assert isinstance(table, pa.Table) + assert table.num_rows == len(pdf) + assert table.schema.names == list(pdf.columns) + + +def test_engine_to_polars_from_all_inputs(profiling_frame: pd.DataFrame): + pdf = profiling_frame + for source in ( + pdf, + pl.from_pandas(pdf), + pl.from_pandas(pdf).lazy(), + pa.Table.from_pandas(pdf, preserve_index=False), + ): + out = engine.to_polars(source) + assert isinstance(out, pl.DataFrame) + assert out.height == len(pdf) + assert out.columns == list(pdf.columns) + + +def test_engine_to_lazyframe_passthrough_and_convert(profiling_frame: pd.DataFrame): + lf = pl.from_pandas(profiling_frame).lazy() + # passthrough + assert engine.to_lazyframe(lf) is lf + # convert from pandas + converted = engine.to_lazyframe(profiling_frame) + assert isinstance(converted, pl.LazyFrame) + assert converted.collect().height == len(profiling_frame) + + +def test_engine_to_pandas_returns_pandas_identity(profiling_frame: pd.DataFrame): + # pandas inputs are returned untouched (object columns are never disturbed) + assert engine.to_pandas(profiling_frame) is profiling_frame + converted = engine.to_pandas(pl.from_pandas(profiling_frame)) + assert isinstance(converted, pd.DataFrame) + _assert_tabular_equivalent(profiling_frame, converted) + + +def test_engine_rejects_unsupported_type(): + with pytest.raises(TypeError): + engine.to_arrow(object()) + with pytest.raises(TypeError): + engine.to_polars(42) + + +# --------------------------------------------------------------------------- # +# Round-trip interchange guarantees +# --------------------------------------------------------------------------- # +def test_roundtrip_arrow(profiling_frame: pd.DataFrame): + cdf = CytoDataFrame(profiling_frame) + table = cdf.to_arrow() + restored = CytoDataFrame(table) + assert isinstance(restored, CytoDataFrame) + _assert_tabular_equivalent(profiling_frame, pd.DataFrame(restored)) + + +def test_roundtrip_polars(profiling_frame: pd.DataFrame): + cdf = CytoDataFrame(profiling_frame) + restored = CytoDataFrame(cdf.to_polars()) + _assert_tabular_equivalent(profiling_frame, pd.DataFrame(restored)) + + +def test_roundtrip_pandas(profiling_frame: pd.DataFrame): + cdf = CytoDataFrame(profiling_frame) + restored = CytoDataFrame(cdf.to_pandas()) + _assert_tabular_equivalent(profiling_frame, pd.DataFrame(restored)) + + +def test_roundtrip_parquet(profiling_frame: pd.DataFrame, tmp_path: pathlib.Path): + cdf = CytoDataFrame(profiling_frame) + out = tmp_path / "profiles.parquet" + cdf.export(str(out)) + restored = CytoDataFrame(str(out)) + _assert_tabular_equivalent(profiling_frame, pd.DataFrame(restored)) + + +def test_roundtrip_preserves_schema(profiling_frame: pd.DataFrame): + cdf = CytoDataFrame(profiling_frame) + # Arrow schema names + the inferred CytoSchema survive a polars round-trip. + before = cdf.cyto_schema.to_dict() + after = CytoDataFrame(cdf.to_polars()).cyto_schema.to_dict() + assert before == after + + +def test_scan_parquet_helper(profiling_frame: pd.DataFrame, tmp_path: pathlib.Path): + out = tmp_path / "profiles.parquet" + profiling_frame.to_parquet(out) + lf = engine.scan_parquet(str(out)) + assert isinstance(lf, pl.LazyFrame) + assert lf.collect().height == len(profiling_frame) diff --git a/tests/test_lazy.py b/tests/test_lazy.py new file mode 100644 index 0000000..daa1aa9 --- /dev/null +++ b/tests/test_lazy.py @@ -0,0 +1,164 @@ +""" +Tests for the CytoLazyFrame lazy query builder (lazy.py). + +Covers the lazy-execution surface from the evolution plan and differential +validation that lazy Polars execution matches the equivalent pandas result. +""" + +import pathlib + +import pandas as pd +import polars as pl +import pyarrow as pa +import pytest + +from cytodataframe import CytoDataFrame, CytoLazyFrame + + +@pytest.fixture(name="profiles") +def fixture_profiles() -> pd.DataFrame: + return pd.DataFrame( + { + "Metadata_Well": ["A01", "A01", "B02", "B02", "C03"], + "Metadata_ObjectNumber": [1, 2, 1, 2, 1], + "Cells_AreaShape_Area": [10.0, 20.0, 30.0, 40.0, 50.0], + "Nuclei_Location_Center_X": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ) + + +def test_to_lazy_returns_cytolazyframe(profiles: pd.DataFrame): + lazy = CytoDataFrame(profiles).to_lazy() + assert isinstance(lazy, CytoLazyFrame) + assert lazy.columns == list(profiles.columns) + + +def test_lazy_filter_matches_pandas(profiles: pd.DataFrame): + cdf = CytoDataFrame(profiles) + lazy_result = ( + cdf.to_lazy().filter(pl.col("Cells_AreaShape_Area") >= 30.0).collect() + ) + pandas_result = profiles[profiles["Cells_AreaShape_Area"] >= 30.0] + + assert isinstance(lazy_result, CytoDataFrame) + assert len(lazy_result) == len(pandas_result) + assert ( + lazy_result["Cells_AreaShape_Area"].tolist() + == pandas_result["Cells_AreaShape_Area"].tolist() + ) + + +def test_lazy_eager_equivalence(profiles: pd.DataFrame): + """Lazy and eager polars execution produce identical results.""" + cdf = CytoDataFrame(profiles) + lazy_df = ( + cdf.to_lazy().filter(pl.col("Metadata_Well") == "B02").to_polars() + ) + eager_df = cdf.to_polars().filter(pl.col("Metadata_Well") == "B02") + assert lazy_df.equals(eager_df) + + +def test_lazy_select_features(profiles: pd.DataFrame): + cdf = CytoDataFrame(profiles) + result = cdf.to_lazy().select_features().collect() + # geometry column dropped; metadata + feature retained + assert "Nuclei_Location_Center_X" not in result.columns + assert "Cells_AreaShape_Area" in result.columns + assert "Metadata_Well" in result.columns + + +def test_lazy_select_features_explicit_no_metadata(profiles: pd.DataFrame): + cdf = CytoDataFrame(profiles) + result = ( + cdf.to_lazy() + .select_features(["Cells_AreaShape_Area"], keep_metadata=False) + .collect() + ) + assert list(result.columns) == ["Cells_AreaShape_Area"] + + +def test_lazy_group_by_agg(profiles: pd.DataFrame): + cdf = CytoDataFrame(profiles) + result = ( + cdf.to_lazy() + .group_by("Metadata_Well") + .agg(pl.col("Cells_AreaShape_Area").sum().alias("total")) + .collect() + ) + totals = dict( + zip( + result["Metadata_Well"].tolist(), + result["total"].tolist(), + strict=False, + ) + ) + expected = profiles.groupby("Metadata_Well")["Cells_AreaShape_Area"].sum() + assert totals["A01"] == expected["A01"] + assert totals["B02"] == expected["B02"] + + +def test_lazy_join(profiles: pd.DataFrame): + cdf = CytoDataFrame(profiles) + annotations = pl.DataFrame( + {"Metadata_Well": ["A01", "B02"], "treatment": ["drug", "ctrl"]} + ) + result = ( + cdf.to_lazy().join(annotations, on="Metadata_Well", how="inner").collect() + ) + assert "treatment" in result.columns + # only A01 (2 rows) + B02 (2 rows) survive the inner join + assert len(result) == 4 + + +def test_lazy_rename_and_drop(profiles: pd.DataFrame): + cdf = CytoDataFrame(profiles) + result = ( + cdf.to_lazy() + .rename({"Cells_AreaShape_Area": "area"}) + .drop("Nuclei_Location_Center_X") + .collect() + ) + assert "area" in result.columns + assert "Nuclei_Location_Center_X" not in result.columns + + +def test_lazy_to_arrow_and_polars(profiles: pd.DataFrame): + lazy = CytoDataFrame(profiles).to_lazy() + assert isinstance(lazy.to_arrow(), pa.Table) + assert isinstance(lazy.to_polars(), pl.DataFrame) + assert isinstance(lazy.to_pandas(), pd.DataFrame) + + +def test_lazy_context_carry_through(profiles: pd.DataFrame, tmp_path: pathlib.Path): + """Image/display context survives a lazy pipeline into the collected frame.""" + ctx_dir = str(tmp_path) + cdf = CytoDataFrame( + profiles, + data_context_dir=ctx_dir, + display_options={"width": 123}, + ) + collected = cdf.to_lazy().filter(pl.col("Metadata_Well") == "A01").collect() + assert collected._custom_attrs["data_context_dir"] == ctx_dir + assert collected._custom_attrs["display_options"] == {"width": 123} + + +def test_scan_parquet_pipeline(profiles: pd.DataFrame, tmp_path: pathlib.Path): + out = tmp_path / "profiles.parquet" + profiles.to_parquet(out) + result = ( + CytoDataFrame.scan_parquet(str(out), data_context_dir=str(tmp_path)) + .filter(pl.col("Metadata_Well") == "A01") + .select_features() + .collect() + ) + assert isinstance(result, CytoDataFrame) + assert len(result) == 2 + assert result._custom_attrs["data_context_dir"] == str(tmp_path) + + +def test_scan_parquet_returns_lazyframe(profiles: pd.DataFrame, tmp_path: pathlib.Path): + out = tmp_path / "profiles.parquet" + profiles.to_parquet(out) + scanned = CytoDataFrame.scan_parquet(str(out)) + assert isinstance(scanned, CytoLazyFrame) + assert "CytoLazyFrame" in repr(scanned) diff --git a/tests/test_schema.py b/tests/test_schema.py new file mode 100644 index 0000000..e4cced9 --- /dev/null +++ b/tests/test_schema.py @@ -0,0 +1,199 @@ +""" +Tests for the CytoDataFrame formal schema system (schema.py). + +Covers deterministic schema inference (differential against the hand-written +classification rules), property-based invariants via Hypothesis, and the +Arrow-native bounding-box / centroid struct helpers. +""" + +import pandas as pd +import polars as pl +import pyarrow as pa +import pytest +from hypothesis import given +from hypothesis import strategies as st + +from cytodataframe import CytoDataFrame, CytoSchema +from cytodataframe.schema import add_bbox_struct, add_centroid_struct + + +@pytest.fixture(name="cellprofiler_frame") +def fixture_cellprofiler_frame() -> pd.DataFrame: + """A frame mirroring CellProfiler-style single-cell output.""" + return pd.DataFrame( + { + "Metadata_Well": ["A01", "B02"], + "Metadata_Site": [1, 2], + "ImageNumber": [1, 1], + "ObjectNumber": [1, 2], + "Image_FileName_DNA": ["a.tif", "b.tif"], + "Image_PathName_DNA": ["/imgs", "/imgs"], + "Cells_AreaShape_Area": [100.0, 200.0], + "Cells_Intensity_MeanIntensity_DNA": [0.5, 0.6], + "Nuclei_Location_Center_X": [5.0, 6.0], + "Nuclei_Location_Center_Y": [7.0, 8.0], + "Cells_AreaShape_BoundingBoxMinimum_X": [0, 1], + } + ) + + +# --------------------------------------------------------------------------- # +# Deterministic / differential classification +# --------------------------------------------------------------------------- # +def test_schema_classification_buckets(cellprofiler_frame: pd.DataFrame): + schema = CytoSchema.from_pandas(cellprofiler_frame) + + assert schema.image_key == "Image_FileName_DNA" + assert schema.object_key == "ObjectNumber" + + # Features are numeric measurement columns only. + assert set(schema.feature_columns) == { + "Cells_AreaShape_Area", + "Cells_Intensity_MeanIntensity_DNA", + } + # Geometry columns are spatial coordinates. + assert set(schema.geometry_columns) == { + "Nuclei_Location_Center_X", + "Nuclei_Location_Center_Y", + "Cells_AreaShape_BoundingBoxMinimum_X", + } + # Metadata holds identifiers + image references. + assert "Metadata_Well" in schema.metadata_columns + assert "Image_FileName_DNA" in schema.metadata_columns + assert "ObjectNumber" in schema.metadata_columns + + +def test_schema_inference_matches_across_backends(cellprofiler_frame: pd.DataFrame): + """pandas, polars, and Arrow inference agree.""" + from_pandas = CytoSchema.from_pandas(cellprofiler_frame).to_dict() + from_polars = CytoSchema.from_polars( + pl.from_pandas(cellprofiler_frame) + ).to_dict() + from_arrow = CytoSchema.from_arrow( + pa.Table.from_pandas(cellprofiler_frame, preserve_index=False).schema + ).to_dict() + assert from_pandas == from_polars == from_arrow + + +def test_schema_infer_dispatch(cellprofiler_frame: pd.DataFrame): + table = pa.Table.from_pandas(cellprofiler_frame, preserve_index=False) + assert CytoSchema.infer(table).to_dict() == CytoSchema.infer( + cellprofiler_frame + ).to_dict() + assert CytoSchema.infer(pl.from_pandas(cellprofiler_frame).lazy()).to_dict() == ( + CytoSchema.infer(cellprofiler_frame).to_dict() + ) + + +def test_schema_validate_and_require(cellprofiler_frame: pd.DataFrame): + schema = CytoSchema.from_pandas(cellprofiler_frame) + assert schema.validate() == [] + assert schema.require("image_key", "object_key") is schema + + bare = CytoSchema.from_columns(["Cells_AreaShape_Area"]) + with pytest.raises(ValueError, match="missing required key"): + bare.require("image_key") + + +def test_schema_validate_detects_overlap(): + bad = CytoSchema( + feature_columns=["x"], + metadata_columns=["x"], + ) + issues = bad.validate() + assert any("feature and metadata" in issue for issue in issues) + with pytest.raises(ValueError): + bad.validate(strict=True) + + +def test_cytodataframe_cyto_schema_property(cellprofiler_frame: pd.DataFrame): + cdf = CytoDataFrame(cellprofiler_frame) + assert cdf.cyto_schema.image_key == "Image_FileName_DNA" + + +# --------------------------------------------------------------------------- # +# Property-based invariants +# --------------------------------------------------------------------------- # +_NAME_VOCAB = [ + "Metadata_Well", + "Metadata_Plate", + "ImageNumber", + "ObjectNumber", + "Image_FileName_DNA", + "Image_PathName_DNA", + "Cells_AreaShape_Area", + "Nuclei_Intensity_MeanIntensity", + "Cells_AreaShape_BoundingBox_Minimum_X", + "Nuclei_Location_Center_X", + "RandomFeature_1", + "AnnotationLabel", +] + + +@given( + columns=st.lists( + st.sampled_from(_NAME_VOCAB), min_size=1, max_size=12, unique=True + ), + numeric_seed=st.lists(st.booleans(), min_size=12, max_size=12), +) +def test_schema_partition_invariants(columns: list, numeric_seed: list): + numeric = { + name: numeric_seed[idx % len(numeric_seed)] + for idx, name in enumerate(columns) + } + schema = CytoSchema.from_columns(columns, numeric=numeric) + + meta = set(schema.metadata_columns) + feat = set(schema.feature_columns) + geom = set(schema.geometry_columns) + + # Every column is classified into exactly one of the three buckets. + assert meta | feat | geom == set(columns) + assert meta.isdisjoint(feat) + assert feat.isdisjoint(geom) + assert meta.isdisjoint(geom) + + # A non-numeric column is never treated as a feature. + for name in columns: + if not numeric[name]: + assert name not in feat + + +# --------------------------------------------------------------------------- # +# Arrow-native struct helpers (Phase 3) +# --------------------------------------------------------------------------- # +def test_add_bbox_struct_keeps_flattened(): + df = pl.DataFrame( + { + "Cells_AreaShape_BoundingBoxMinimum_X": [0, 1], + "Cells_AreaShape_BoundingBoxMinimum_Y": [0, 1], + "Cells_AreaShape_BoundingBoxMaximum_X": [10, 11], + "Cells_AreaShape_BoundingBoxMaximum_Y": [10, 11], + } + ) + out = add_bbox_struct(df) + assert "bbox" in out.columns + # flattened compatibility columns remain available + assert "Cells_AreaShape_BoundingBoxMinimum_X" in out.columns + struct = out["bbox"][0] + assert struct["min_x"] == 0 + assert struct["max_y"] == 10 + + +def test_add_centroid_struct_xy(): + df = pl.DataFrame( + { + "Nuclei_Location_Center_X": [5.0, 6.0], + "Nuclei_Location_Center_Y": [7.0, 8.0], + } + ) + out = add_centroid_struct(df) + assert "centroid" in out.columns + assert out["centroid"][0]["x"] == 5.0 + assert out["centroid"][0]["y"] == 7.0 + + +def test_struct_helpers_noop_without_columns(): + df = pl.DataFrame({"a": [1, 2]}) + assert add_bbox_struct(df).columns == ["a"] + assert add_centroid_struct(df).columns == ["a"] diff --git a/uv.lock b/uv.lock index cd3704a..92ceace 100644 --- a/uv.lock +++ b/uv.lock @@ -425,22 +425,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/51/aac7e419521d5519e13087a7198623655648c939822bd7f4bdc9ccbe07f9/botocore-1.42.42-py3-none-any.whl", hash = "sha256:1c9df5fc31e9073a9aa956271c4007d72f5d342cafca5f4154ea099bc6f83085", size = 14600186, upload-time = "2026-02-04T20:28:29.268Z" }, ] -[[package]] -name = "bqplot" -version = "0.12.45" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ipywidgets" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "traitlets" }, - { name = "traittypes" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/e0/727335c5ff8cee68d21a8c79f5b8406011639a76ecd7a6462a60aa8b0608/bqplot-0.12.45.tar.gz", hash = "sha256:ede00e9fdf7d92e43cc2d1b9691c7da176b6216fdd187c8e92f19d7beaca5e2a", size = 1205882, upload-time = "2025-05-21T17:32:29.143Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/73/03/6b5370fc626e6f480c4a0b4cb25b3459d390745010618b21b4b573423a53/bqplot-0.12.45-py2.py3-none-any.whl", hash = "sha256:cf2e046adb401670902ab53a18d9f63540091279bc45c4ef281bfdadf6e7e92c", size = 1237450, upload-time = "2025-05-21T17:32:27.617Z" }, -] - [[package]] name = "certifi" version = "2026.1.4" @@ -779,17 +763,27 @@ dependencies = [ { name = "imagecodecs" }, { name = "imageio" }, { name = "ipython" }, - { name = "ipyvolume" }, { name = "ipywidgets" }, - { name = "matplotlib" }, - { name = "nest-asyncio" }, - { name = "ome-arrow" }, { name = "opencv-python" }, { name = "pandas" }, + { name = "polars" }, { name = "pyarrow" }, - { name = "pyvista" }, - { name = "pywavelets" }, { name = "scikit-image" }, +] + +[package.optional-dependencies] +all = [ + { name = "ome-arrow" }, + { name = "pyvista" }, + { name = "trame" }, + { name = "trame-vtk" }, + { name = "trame-vuetify" }, +] +ome = [ + { name = "ome-arrow" }, +] +viz3d = [ + { name = "pyvista" }, { name = "trame" }, { name = "trame-vtk" }, { name = "trame-vuetify" }, @@ -802,14 +796,20 @@ dev = [ { name = "coverage" }, { name = "duckdb" }, { name = "httpcore" }, + { name = "hypothesis" }, { name = "isort" }, { name = "jupyterlab" }, { name = "jupyterlab-code-formatter" }, { name = "jupytext" }, + { name = "ome-arrow" }, { name = "poethepoet" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pyvista" }, { name = "sqlalchemy" }, + { name = "trame" }, + { name = "trame-vtk" }, + { name = "trame-vuetify" }, ] docs = [ { name = "dunamai" }, @@ -822,24 +822,23 @@ docs = [ [package.metadata] requires-dist = [ + { name = "cytodataframe", extras = ["ome", "viz3d"], marker = "extra == 'all'" }, { name = "imagecodecs", specifier = ">=2024.9.22,<2027" }, { name = "imageio", specifier = ">=2.37,<3" }, { name = "ipython", specifier = ">=8.12.3,<10" }, - { name = "ipyvolume", specifier = ">=0.6.3,<0.7" }, { name = "ipywidgets", specifier = ">=8.1.7,<9" }, - { name = "matplotlib", specifier = ">=3.9.3,<4" }, - { name = "nest-asyncio", specifier = ">=1.6,<2" }, - { name = "ome-arrow", specifier = ">=0.0.3,<0.0.9" }, + { name = "ome-arrow", marker = "extra == 'ome'", specifier = ">=0.0.3,<0.0.10" }, { name = "opencv-python", specifier = ">=4.10.0.84,<5" }, { name = "pandas", specifier = ">=2.2.2,<4" }, + { name = "polars", specifier = ">=1,<2" }, { name = "pyarrow", specifier = ">=16" }, - { name = "pyvista", specifier = ">=0.46.4" }, - { name = "pywavelets", specifier = ">1.4.1" }, + { name = "pyvista", marker = "extra == 'viz3d'", specifier = ">=0.46.4" }, { name = "scikit-image", specifier = ">0.19.3" }, - { name = "trame", specifier = ">=3.12" }, - { name = "trame-vtk", specifier = ">=2.10" }, - { name = "trame-vuetify", specifier = ">=3.1" }, + { name = "trame", marker = "extra == 'viz3d'", specifier = ">=3.12" }, + { name = "trame-vtk", marker = "extra == 'viz3d'", specifier = ">=2.10" }, + { name = "trame-vuetify", marker = "extra == 'viz3d'", specifier = ">=3.1" }, ] +provides-extras = ["ome", "viz3d", "all"] [package.metadata.requires-dev] dev = [ @@ -848,20 +847,26 @@ dev = [ { name = "coverage", specifier = ">=7.6,<8" }, { name = "duckdb", specifier = ">=1.1.3,<2" }, { name = "httpcore", specifier = ">=0.18,<1.1" }, + { name = "hypothesis", specifier = ">=6,<7" }, { name = "isort", specifier = ">=5.13.2,<9" }, { name = "jupyterlab", specifier = ">=4.3,<5" }, { name = "jupyterlab-code-formatter", specifier = ">=3.0.2,<4" }, { name = "jupytext", specifier = ">=1.16.4,<2" }, - { name = "poethepoet", specifier = ">=0.37,<0.43" }, + { name = "ome-arrow", specifier = ">=0.0.3,<0.0.10" }, + { name = "poethepoet", specifier = ">=0.37,<0.47" }, { name = "pytest", specifier = ">=8.3.3,<10" }, { name = "pytest-cov", specifier = ">=5,<8" }, + { name = "pyvista", specifier = ">=0.46.4" }, { name = "sqlalchemy", specifier = ">=1.3.6,<3" }, + { name = "trame", specifier = ">=3.12" }, + { name = "trame-vtk", specifier = ">=2.10" }, + { name = "trame-vuetify", specifier = ">=3.1" }, ] docs = [ { name = "dunamai", specifier = ">=1.22,<2" }, { name = "myst-nb", specifier = ">=1.1.2,<2" }, { name = "myst-parser", specifier = ">=3,<6" }, - { name = "pydata-sphinx-theme", specifier = ">=0.16,<0.17" }, + { name = "pydata-sphinx-theme", specifier = ">=0.16,<0.19" }, { name = "sphinx", specifier = ">=9,<9.1" }, { name = "sphinx-multiversion", git = "https://github.com/J-RN/sphinx-multiversion?rev=a77f0c862dace3a62c18fc866da60ef7dde3873d" }, ] @@ -1315,6 +1320,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "hypothesis" +version = "6.155.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sortedcontainers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/aa/9a91a4addf285702a98713da44b3581799539426436617bfb8914478c166/hypothesis-6.155.6.tar.gz", hash = "sha256:7569e1897690336c85d49d8391b49ec6ab83d951009515bfc29faebbac286cf5", size = 478038, upload-time = "2026-06-19T13:21:23.379Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/a9/4c17e962c2e9cbc314bb579ed2e2b2da45d7b6b942aab6948d14d85abfea/hypothesis-6.155.6-py3-none-any.whl", hash = "sha256:a96d9a29f6bbc8ccac39dd84e140892da76765464929f401a4181b90c20c9ad1", size = 544521, upload-time = "2026-06-19T13:21:20.934Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -1385,20 +1402,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] -[[package]] -name = "ipydatawidgets" -version = "4.3.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ipywidgets" }, - { name = "numpy" }, - { name = "traittypes" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bc/88/332ba20bb0e0b8078f97bc1469f332be796b804c565b41163b93241e0657/ipydatawidgets-4.3.5.tar.gz", hash = "sha256:394f2489576587cfd755377a09a067f46cad22081965092021fd1abcbe7852a8", size = 799182, upload-time = "2023-06-14T11:16:06.587Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/5b/e63c877c4c94382b66de5045e08ec8cd960e8a4d22f0d62a4dfb1f9e5ac6/ipydatawidgets-4.3.5-py2.py3-none-any.whl", hash = "sha256:d590cdb7c364f2f6ab346f20b9d2dd661d27a834ef7845bc9d7113118f05ec87", size = 271703, upload-time = "2023-06-14T11:16:03.955Z" }, -] - [[package]] name = "ipykernel" version = "7.2.0" @@ -1457,62 +1460,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, ] -[[package]] -name = "ipyvolume" -version = "0.6.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "bqplot" }, - { name = "ipyvue" }, - { name = "ipyvuetify" }, - { name = "ipywebrtc" }, - { name = "ipywidgets" }, - { name = "matplotlib" }, - { name = "numpy" }, - { name = "pillow" }, - { name = "pythreejs" }, - { name = "requests" }, - { name = "traitlets" }, - { name = "traittypes" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bc/8c/560b41f231006d6b10749289aa33173268afc06cee92a77570d3fc4dff38/ipyvolume-0.6.3.tar.gz", hash = "sha256:823226f90a59ce08b1da2699a9ec505f34f65f01ce43accd80e7d3554082d035", size = 1596303, upload-time = "2023-06-02T14:33:08.671Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/ca/153406ca7ff41ea3ecf8c3b5c0db07364461e867fb197b1723bf0be2652d/ipyvolume-0.6.3-py3-none-any.whl", hash = "sha256:550761b5cc1a9fb0e8931056fd523b2f0074ddea46633a248f996168e5b0d7f6", size = 1612135, upload-time = "2023-06-02T14:33:05.246Z" }, -] - -[[package]] -name = "ipyvue" -version = "1.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ipywidgets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d2/37/7b66ea86cde30f4983566cbfb8bb133eed4d2252a7f0b941057855e666e7/ipyvue-1.12.0.tar.gz", hash = "sha256:408b5e6a64e203fc679f447a071e3dbc178ab2906982f248adf722fc84773ffa", size = 1749270, upload-time = "2026-02-11T10:07:43.884Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/be/cb0bd788bda9624a2facd270a6b1eef1b606bdeacee3a6c1cf9e79704afc/ipyvue-1.12.0-py2.py3-none-any.whl", hash = "sha256:c7f555a71c28724ceda344af294bdc48407eace17222065cfb7b4cff80665362", size = 2673161, upload-time = "2026-02-11T10:07:41.91Z" }, -] - -[[package]] -name = "ipyvuetify" -version = "1.11.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ipyvue" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/07/31c9615532b6c190a3033460e4aa83a64ac532281758ff734e1bc42e3c00/ipyvuetify-1.11.3.tar.gz", hash = "sha256:3580afa76d9add4ae04ccb7fd57d4a0cf03a261705742e7137def3ebb65ac71d", size = 6170730, upload-time = "2025-07-02T11:25:12.691Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/4d/fd1a6a888f8abb6b8dc316cc78b5153e75eff7ae66a94cf30b144fadd09d/ipyvuetify-1.11.3-py2.py3-none-any.whl", hash = "sha256:fa83aaf9f4ce669172d532094d60bd7c40d3cb9c5d6bb2f4a14565da2b09a8d8", size = 6290266, upload-time = "2025-07-02T11:25:10.553Z" }, -] - -[[package]] -name = "ipywebrtc" -version = "0.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/80/1f/7f603de52bb167eb37168c16dd5b0218cc3e336ef0538e178f0fbeff5e90/ipywebrtc-0.6.0.tar.gz", hash = "sha256:f8ac3cc02b3633b59f388aef67961cff57f90028fd303bb3886c63c3d631da13", size = 253863, upload-time = "2021-03-29T11:27:33.42Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e9/11/4b83894a009ef522b5751881e21ffec55d56b0900c0b788e2906ec01c51d/ipywebrtc-0.6.0-py2.py3-none-any.whl", hash = "sha256:01a6c9d79ab937c280ce4635a149c7b681457e99ea779c00c7a6aa44ee6916f8", size = 260745, upload-time = "2021-03-29T11:27:31.379Z" }, -] - [[package]] name = "ipywidgets" version = "8.1.8" @@ -2839,6 +2786,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/5e/0b83e0222ce5921b3f9081eeca8c6fb3e1cfd5ca0d06338adf93b28ce061/poethepoet-0.41.0-py3-none-any.whl", hash = "sha256:4bab9fd8271664c5d21407e8f12827daeb6aa484dc6cc7620f0c3b4e62b42ee4", size = 113590, upload-time = "2026-02-08T20:45:34.697Z" }, ] +[[package]] +name = "polars" +version = "1.41.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "polars-runtime-32" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/f9/aeda46259b0669247a160315d2d51269de9504b9dd2f70acadbcb22f46b7/polars-1.41.2.tar.gz", hash = "sha256:256d6731162371b77f3f29a55eacb8c0fc740ddb1a293a01d2ef5b5393c5c708", size = 737996, upload-time = "2026-05-29T17:39:15.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/22/28f62d24f7db56ac4343588f9362d49b7b4177e55ac47a466fe696b0099b/polars-1.41.2-py3-none-any.whl", hash = "sha256:23ce9a2910b6e3e8d4258770bf44aa17170958df7af6e85feedf4458a04d8d29", size = 833445, upload-time = "2026-05-29T17:37:05.576Z" }, +] + +[[package]] +name = "polars-runtime-32" +version = "1.41.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/56/54e3ea0e9b64f327179049e4742241cc6b1d3e8fa414b05a057dd26df367/polars_runtime_32-1.41.2.tar.gz", hash = "sha256:7af09ec1ab053da2c9669e8d15f809a4083a29be05db57111688b8051062af56", size = 2989474, upload-time = "2026-05-29T17:39:17.257Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d6/9b/fe72a3811c0357cdb06c67bdc7695fa1623ad47948fc523195f5ac31037f/polars_runtime_32-1.41.2-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:95a08346dac337357cdb825c8076df7d36da54c4caa59a5cb41d0a30691c5edd", size = 52265283, upload-time = "2026-05-29T17:37:09.407Z" }, + { url = "https://files.pythonhosted.org/packages/0a/93/fab9da803fd80d9e83ef88c20932f637a10bc611b20415fc322eec84bc44/polars_runtime_32-1.41.2-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:dedfaeec2c7f995298da7319dd9431d662e5dd1d0ec51b1459df4a0234ceff52", size = 46571222, upload-time = "2026-05-29T17:37:13.698Z" }, + { url = "https://files.pythonhosted.org/packages/c8/2a/8843f34a8ac57acd058a39b87b03b580dd352a490e9dae0415e02033bdd4/polars_runtime_32-1.41.2-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18eea22c5cc34e27f8a60950458ad81e6a9ea75e89363ca1367e14e7e7f781fc", size = 50409372, upload-time = "2026-05-29T17:37:17.875Z" }, + { url = "https://files.pythonhosted.org/packages/6c/c6/92b352fe88cf51bd0a19fb99e1c0cbe46aa26c14dcf7995b89869cd932ae/polars_runtime_32-1.41.2-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2630540dfdfb0f36f9b04a07c7c2e3f50bf2ad384113263c1c812007ee9141e0", size = 56405484, upload-time = "2026-05-29T17:37:22.684Z" }, + { url = "https://files.pythonhosted.org/packages/74/c4/bae3174c3b02f6b441d2e58594387abcd509f67a098f682a83b195f08966/polars_runtime_32-1.41.2-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:20e969e08f9b137e233c04cc04de73d9795f89eb77d34854e40a025965a43763", size = 50603512, upload-time = "2026-05-29T17:37:27.422Z" }, + { url = "https://files.pythonhosted.org/packages/f4/ed/f2d26ae02d92c2689056838ed59e2a626326ad23c2831d58637d25f6c82a/polars_runtime_32-1.41.2-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e7016a3deb641b64a31447abbbee0f34bd020a6a9ae34ee6b743837def15e2a4", size = 54328561, upload-time = "2026-05-29T17:37:32.587Z" }, + { url = "https://files.pythonhosted.org/packages/9b/c4/9c3831cc885dc7769e59abf8f583821a5fb4403fd0e4eba0ccc6d47a3d4b/polars_runtime_32-1.41.2-cp310-abi3-win_amd64.whl", hash = "sha256:1e5e5377c315e0dcafdfb2a31adc546abbaeb3f9cb1864e6536523d2af473265", size = 51978643, upload-time = "2026-05-29T17:37:37.443Z" }, + { url = "https://files.pythonhosted.org/packages/cd/c6/79e9f3f270270d7ed5575d92b7bfef49f01abd9275447161275b23b553a8/polars_runtime_32-1.41.2-cp310-abi3-win_arm64.whl", hash = "sha256:843d96f69d18eca53429c1198e58891db7f18111f83b9c419bb45ad9d73eaed5", size = 46006901, upload-time = "2026-05-29T17:37:42.522Z" }, +] + [[package]] name = "pooch" version = "1.9.0" @@ -3212,21 +3187,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/e5/fecf13f06e5e5f67e8837d777d1bc43fac0ed2b77a676804df5c34744727/python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2", size = 15548, upload-time = "2025-10-06T04:15:17.553Z" }, ] -[[package]] -name = "pythreejs" -version = "2.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ipydatawidgets" }, - { name = "ipywidgets" }, - { name = "numpy" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0a/2e/0ec94286b8eb3fe1200700080e8adb2c8d871bb8db589858a49600d97a7d/pythreejs-2.4.2.tar.gz", hash = "sha256:a568bfdc4c3797c4c2339158928edc7dcf6fa4a267b08e3cec5121e2078b5bd6", size = 4731310, upload-time = "2023-02-20T00:23:30.081Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/8b/e2bbeb42068f0c48899e8eddd34902afc0f7429d4d2a152d2dc2670dc661/pythreejs-2.4.2-py3-none-any.whl", hash = "sha256:8418807163ad91f4df53b58c4e991b26214852a1236f28f1afeaadf99d095818", size = 3363905, upload-time = "2023-02-20T00:23:27.283Z" }, -] - [[package]] name = "pytokens" version = "0.4.1" @@ -3279,49 +3239,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/c1/c8efb5c0696fa3f7f7c424234dc08fa1e0ecc2292c53500090d93c81a648/pyvista-0.47.0-py3-none-any.whl", hash = "sha256:35d9b003d3bfac709da5b76dd264919b6847c469be08283d3295833f6a7ea657", size = 2508448, upload-time = "2026-02-08T20:21:00.614Z" }, ] -[[package]] -name = "pywavelets" -version = "1.9.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5a/75/50581633d199812205ea8cdd0f6d52f12a624886b74bf1486335b67f01ff/pywavelets-1.9.0.tar.gz", hash = "sha256:148d12203377772bea452a59211d98649c8ee4a05eff019a9021853a36babdc8", size = 3938340, upload-time = "2025-08-04T16:20:04.978Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/8b/ca700d0c174c3a4eec1fbb603f04374d1fed84255c2a9f487cfaa749c865/pywavelets-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:54662cce4d56f0d6beaa6ebd34b2960f3aa4a43c83c9098a24729e9dc20a4be2", size = 4323640, upload-time = "2025-08-04T16:18:51.683Z" }, - { url = "https://files.pythonhosted.org/packages/b5/f3/0fa57b6407ea9c4452b0bc182141256b9481b479ffbfc9d7fdb73afe193b/pywavelets-1.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d8ed4b4d1eab9347e8fe0c5b45008ce5a67225ce5b05766b8b1fa923a5f8b34", size = 4294938, upload-time = "2025-08-04T16:18:53.818Z" }, - { url = "https://files.pythonhosted.org/packages/ea/95/a998313c8459a57e488ff2b18e24be9e836aedda3aa3a1673197deeaa59a/pywavelets-1.9.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:862be65481fdfecfd84c6b0ca132ba571c12697a082068921bca5b5e039f1371", size = 4472829, upload-time = "2025-08-04T16:18:55.508Z" }, - { url = "https://files.pythonhosted.org/packages/d8/8c/f316a153f7f89d2753df8a7371d15d0faab87e709fe02715dbc297c79385/pywavelets-1.9.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d76b7fa8fc500b09201d689b4f15bf5887e30ffbe2e1f338eb8470590eb4521a", size = 4524936, upload-time = "2025-08-04T16:18:57.146Z" }, - { url = "https://files.pythonhosted.org/packages/24/f7/89fdc1caef4b384a341a8e149253e23f36c1702bbb986a26123348624854/pywavelets-1.9.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa859d0b686a697c87a47e29319aebe44125f114a4f8c7e444832b921f52de5a", size = 4481475, upload-time = "2025-08-04T16:18:58.725Z" }, - { url = "https://files.pythonhosted.org/packages/82/53/b733fbfb71853e4a5c430da56e325a763562d65241dd785f0fadb67aed6a/pywavelets-1.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:20e97b84a263003e2c7348bcf72beba96edda1a6169f072dc4e4d4ee3a6c7368", size = 4527994, upload-time = "2025-08-04T16:18:59.917Z" }, - { url = "https://files.pythonhosted.org/packages/ed/15/5f6a6e9fdad8341e42642ed622a5f3033da4ea9d426cc3e574ae418b4726/pywavelets-1.9.0-cp311-cp311-win32.whl", hash = "sha256:f8330cdbfa506000e63e79525716df888998a76414c5cd6ecd9a7e371191fb05", size = 4136109, upload-time = "2025-08-04T16:19:01.511Z" }, - { url = "https://files.pythonhosted.org/packages/fd/33/62dbb4aea86ec9d79b283127c42cc896f4d4ff265a9aeb1337a7836dd550/pywavelets-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:ed10959a17df294ef55948dcc76367d59ec7b6aad67e38dd4e313d2fe3ad47b2", size = 4228321, upload-time = "2025-08-04T16:19:03.164Z" }, - { url = "https://files.pythonhosted.org/packages/5c/37/3fda13fb2518fdd306528382d6b18c116ceafefff0a7dccd28f1034f4dd2/pywavelets-1.9.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:30baa0788317d3c938560c83fe4fc43817342d06e6c9662a440f73ba3fb25c9b", size = 4320835, upload-time = "2025-08-04T16:19:04.855Z" }, - { url = "https://files.pythonhosted.org/packages/36/65/a5549325daafc3eae4b52de076798839eaf529a07218f8fb18cccefe76a1/pywavelets-1.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:df7436a728339696a7aa955c020ae65c85b0d9d2b5ff5b4cf4551f5d4c50f2c7", size = 4290469, upload-time = "2025-08-04T16:19:06.178Z" }, - { url = "https://files.pythonhosted.org/packages/05/85/901bb756d37dfa56baa26ef4a3577aecfe9c55f50f51366fede322f8c91d/pywavelets-1.9.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:07b26526db2476974581274c43a9c2447c917418c6bd03c8d305ad2a5cd9fac3", size = 4437717, upload-time = "2025-08-04T16:19:07.514Z" }, - { url = "https://files.pythonhosted.org/packages/0f/34/0f54dd9c288941294898877008bcb5c07012340cc9c5db9cff1bd185d449/pywavelets-1.9.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:573b650805d2f3c981a0e5ae95191c781a722022c37a0f6eba3fa7eae8e0ee17", size = 4483843, upload-time = "2025-08-04T16:19:08.857Z" }, - { url = "https://files.pythonhosted.org/packages/48/1f/cff6bb4ea64ff508d8cac3fe113c0aa95310a7446d9efa6829027cc2afdf/pywavelets-1.9.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3747ec804492436de6e99a7b6130480e53406d047e87dc7095ab40078a515a23", size = 4442236, upload-time = "2025-08-04T16:19:11.061Z" }, - { url = "https://files.pythonhosted.org/packages/ce/53/a3846eeefe0fb7ca63ae045f038457aa274989a15af793c1b824138caf98/pywavelets-1.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5163665686219c3f43fd5bbfef2391e87146813961dad0f86c62d4aed561f547", size = 4488077, upload-time = "2025-08-04T16:19:12.333Z" }, - { url = "https://files.pythonhosted.org/packages/f7/98/44852d2fe94455b72dece2db23562145179d63186a1c971125279a1c381f/pywavelets-1.9.0-cp312-cp312-win32.whl", hash = "sha256:80b8ab99f5326a3e724f71f23ba8b0a5b03e333fa79f66e965ea7bed21d42a2f", size = 4134094, upload-time = "2025-08-04T16:19:13.564Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a7/0d9ee3fe454d606e0f5c8e3aebf99d2ecddbfb681826a29397729538c8f1/pywavelets-1.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:92bfb8a117b8c8d3b72f2757a85395346fcbf37f50598880879ae72bd8e1c4b9", size = 4213900, upload-time = "2025-08-04T16:19:14.939Z" }, - { url = "https://files.pythonhosted.org/packages/db/a7/dec4e450675d62946ad975f5b4d924437df42d2fae46e91dfddda2de0f5a/pywavelets-1.9.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:74f8455c143818e4b026fc67b27fd82f38e522701b94b8a6d1aaf3a45fcc1a25", size = 4316201, upload-time = "2025-08-04T16:19:16.259Z" }, - { url = "https://files.pythonhosted.org/packages/aa/0c/b54b86596c0df68027e48c09210e907e628435003e77048384a2dd6767e3/pywavelets-1.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c50320fe0a4a23ddd8835b3dc9b53b09ee05c7cc6c56b81d0916f04fc1649070", size = 4286838, upload-time = "2025-08-04T16:19:17.92Z" }, - { url = "https://files.pythonhosted.org/packages/5a/9c/333969c3baad8af2e7999e83addcb7bb1d1fd48e2d812fb27e2e89582cb1/pywavelets-1.9.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d6e059265223ed659e5214ab52a84883c88ddf3decbf08d7ec6abb8e4c5ed7be", size = 4430753, upload-time = "2025-08-04T16:19:19.529Z" }, - { url = "https://files.pythonhosted.org/packages/e5/1b/a24c6ff03b026b826ad7b9267bd63cd34ce026795a0302f8a5403840b8e7/pywavelets-1.9.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ae10ed46c139c7ddb8b1249cfe0989f8ccb610d93f2899507b1b1573a0e424b5", size = 4491315, upload-time = "2025-08-04T16:19:20.717Z" }, - { url = "https://files.pythonhosted.org/packages/d7/c7/e3fbb502fca3469e51ced4f1e1326364c338be91edc5db5a8ddd26b303fa/pywavelets-1.9.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c8f8b1cc2df012401cb837ee6fa2f59607c7b4fe0ff409d9a4f6906daf40dc86", size = 4437654, upload-time = "2025-08-04T16:19:22.359Z" }, - { url = "https://files.pythonhosted.org/packages/92/44/c9b25084048d9324881a19b88e0969a4141bcfdc1d218f1b4b680b7af1c1/pywavelets-1.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:db43969c7a8fbb17693ecfd14f21616edc3b29f0e47a49b32fa4127c01312a67", size = 4496435, upload-time = "2025-08-04T16:19:23.842Z" }, - { url = "https://files.pythonhosted.org/packages/cd/b6/b27ec18c72b1dee3314e297af39c5f8136d43cc130dd93cb6c178ca820e5/pywavelets-1.9.0-cp313-cp313-win32.whl", hash = "sha256:9e7d60819d87dcd6c68a2d1bc1d37deb1f4d96607799ab6a25633ea484dcda41", size = 4132709, upload-time = "2025-08-04T16:19:25.415Z" }, - { url = "https://files.pythonhosted.org/packages/0a/87/78ef3f9fb36cdb16ee82371d22c3a7c89eeb79ec8c9daef6222060da6c79/pywavelets-1.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:0d70da9d7858c869e24dc254f16a61dc09d8a224cad85a10c393b2eccddeb126", size = 4213377, upload-time = "2025-08-04T16:19:26.875Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cd/ca0d9db0ff29e3843f6af60c2f5eb588794e05ca8eeb872a595867b1f3f5/pywavelets-1.9.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4dc85f44c38d76a184a1aa2cb038f802c3740428c9bb877525f4be83a223b134", size = 4354336, upload-time = "2025-08-04T16:19:28.745Z" }, - { url = "https://files.pythonhosted.org/packages/82/d6/70afefcc1139f37d02018a3b1dba3b8fc87601bb7707d9616b7f7a76e269/pywavelets-1.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7acf6f950c6deaecd210fbff44421f234a8ca81eb6f4da945228e498361afa9d", size = 4335721, upload-time = "2025-08-04T16:19:30.371Z" }, - { url = "https://files.pythonhosted.org/packages/cd/3a/713f731b9ed6df0c36269c8fb62be8bb28eb343b9e26b13d6abda37bce38/pywavelets-1.9.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:144d4fc15c98da56654d0dca2d391b812b8d04127b194a37ad4a497f8e887141", size = 4418702, upload-time = "2025-08-04T16:19:31.743Z" }, - { url = "https://files.pythonhosted.org/packages/44/e8/f801eb4b5f7a316ba20054948c5d6b27b879c77fab2674942e779974bd86/pywavelets-1.9.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1aa3729585408a979d655736f74b995b511c86b9be1544f95d4a3142f8f4b8b5", size = 4470023, upload-time = "2025-08-04T16:19:32.963Z" }, - { url = "https://files.pythonhosted.org/packages/e9/cc/44b002cb16f2a392f2082308dd470b3f033fa4925d3efa7c46f790ce895a/pywavelets-1.9.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e0e24ad6b8eb399c49606dd1fcdcbf9749ad7f6d638be3fe6f59c1f3098821e2", size = 4426498, upload-time = "2025-08-04T16:19:34.151Z" }, - { url = "https://files.pythonhosted.org/packages/91/fe/2b70276ede7878c5fe8356ca07574db5da63e222ce39a463e84bfad135e8/pywavelets-1.9.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3830e6657236b53a3aae20c735cccead942bb97c54bbca9e7d07bae01645fe9c", size = 4477528, upload-time = "2025-08-04T16:19:35.932Z" }, - { url = "https://files.pythonhosted.org/packages/e7/ed/d58b540c15e36508cfeded7b0d39493e811b0dce18d9d4e6787fb2e89685/pywavelets-1.9.0-cp313-cp313t-win32.whl", hash = "sha256:81bb65facfbd7b50dec50450516e72cdc51376ecfdd46f2e945bb89d39bfb783", size = 4186493, upload-time = "2025-08-04T16:19:37.198Z" }, - { url = "https://files.pythonhosted.org/packages/84/b2/12a849650d618a86bbe4d8876c7e20a7afe59a8cad6f49c57eca9af26dfa/pywavelets-1.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:47d52cf35e2afded8cfe1133663f6f67106a3220b77645476ae660ad34922cb4", size = 4274821, upload-time = "2025-08-04T16:19:38.436Z" }, -] - [[package]] name = "pywinpty" version = "3.0.3" @@ -4087,18 +4004,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] -[[package]] -name = "traittypes" -version = "0.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d6/8d/37d686f52dfbccc47b857751531ffdec262b0f35158dd3b306030dafdb83/traittypes-0.2.3.tar.gz", hash = "sha256:212feed38d566d772648768b78d3347c148ef23915b91c02078188e631316c86", size = 16003, upload-time = "2025-10-22T11:06:09.952Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/c0/fdf9d3ee103ce66a55f0532835ad5e154226c5222423c6636ba049dc42fc/traittypes-0.2.3-py2.py3-none-any.whl", hash = "sha256:49016082ce740d6556d9bb4672ee2d899cd14f9365f17cbb79d5d96b47096d4e", size = 8130, upload-time = "2025-10-22T11:06:08.824Z" }, -] - [[package]] name = "trame" version = "3.12.0" From 16e39778e977aa6d24ffff4dac3ad3b12c81201f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 17:02:28 +0000 Subject: [PATCH 2/3] [pre-commit.ci lite] apply automatic fixes --- .pre-commit-config.yaml | 4 ++-- pyproject.toml | 18 +++++++++--------- src/cytodataframe/engine.py | 4 +--- src/cytodataframe/schema.py | 13 ++++--------- tests/test_lazy.py | 12 +++--------- tests/test_schema.py | 14 ++++++-------- 6 files changed, 25 insertions(+), 40 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index abcf4db..2542871 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: check-yaml - id: detect-private-key - repo: https://github.com/tox-dev/pyproject-fmt - rev: "v2.23.0" + rev: "v2.25.0" hooks: - id: pyproject-fmt - repo: https://github.com/codespell-project/codespell @@ -50,7 +50,7 @@ repos: hooks: - id: actionlint - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.15.15" + rev: "v0.15.18" hooks: - id: ruff-format - id: ruff-check diff --git a/pyproject.toml b/pyproject.toml index 5186635..4a475ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,9 @@ dependencies = [ "pyarrow>=16", "scikit-image>0.19.3", ] +optional-dependencies.all = [ + "cytodataframe[ome,viz3d]", +] # Optional feature stacks. Install with e.g. `pip install cytodataframe[viz3d,ome]`. # These are imported lazily, so the core package imports fine without them. optional-dependencies.ome = [ @@ -43,9 +46,6 @@ optional-dependencies.viz3d = [ "trame-vtk>=2.10", "trame-vuetify>=3.1", ] -optional-dependencies.all = [ - "cytodataframe[ome,viz3d]", -] [dependency-groups] dev = [ @@ -59,13 +59,13 @@ dev = [ "jupyterlab>=4.3,<5", "jupyterlab-code-formatter>=3.0.2,<4", "jupytext>=1.16.4,<2", + # optional-feature stacks needed to exercise the full test suite + "ome-arrow>=0.0.3,<0.0.10", "poethepoet>=0.37,<0.47", "pytest>=8.3.3,<10", "pytest-cov>=5,<8", - "sqlalchemy>=1.3.6,<3", - # optional-feature stacks needed to exercise the full test suite - "ome-arrow>=0.0.3,<0.0.10", "pyvista>=0.46.4", + "sqlalchemy>=1.3.6,<3", "trame>=3.12", "trame-vtk>=2.10", "trame-vuetify>=3.1", @@ -80,14 +80,14 @@ docs = [ ] [tool.setuptools] -package-dir = { "" = "src" } packages.find.where = [ "src" ] +package-dir = { "" = "src" } [tool.setuptools_scm] -root = "." +version_file = "src/cytodataframe/_version.py" version_scheme = "no-guess-dev" local_scheme = "no-local-version" -version_file = "src/cytodataframe/_version.py" +root = "." [tool.uv] default-groups = [ diff --git a/src/cytodataframe/engine.py b/src/cytodataframe/engine.py index 3b20b80..ecd1e92 100644 --- a/src/cytodataframe/engine.py +++ b/src/cytodataframe/engine.py @@ -197,9 +197,7 @@ def normalize_to_pandas(data: TabularData) -> pd.DataFrame: return to_pandas(data) -def scan_parquet( - source: Union[str, pathlib.Path], **kwargs: Any -) -> "pl.LazyFrame": +def scan_parquet(source: Union[str, pathlib.Path], **kwargs: Any) -> "pl.LazyFrame": """ Lazily scan a Parquet file/dataset into a :class:`polars.LazyFrame`. diff --git a/src/cytodataframe/schema.py b/src/cytodataframe/schema.py index 5ec5982..cc19a5f 100644 --- a/src/cytodataframe/schema.py +++ b/src/cytodataframe/schema.py @@ -75,8 +75,7 @@ def _is_image_column(name: str) -> bool: """Return True when a column name references an image filename or path.""" return bool( - _IMAGE_FILENAME_PATTERN.search(name) - or _IMAGE_PATHNAME_PATTERN.search(name) + _IMAGE_FILENAME_PATTERN.search(name) or _IMAGE_PATHNAME_PATTERN.search(name) ) @@ -300,13 +299,11 @@ def validate(self, strict: bool = False) -> List[str]: overlap_fg = feature_set & geometry_set if overlap_fm: issues.append( - f"Columns classified as both feature and metadata: " - f"{sorted(overlap_fm)}" + f"Columns classified as both feature and metadata: {sorted(overlap_fm)}" ) if overlap_fg: issues.append( - f"Columns classified as both feature and geometry: " - f"{sorted(overlap_fg)}" + f"Columns classified as both feature and geometry: {sorted(overlap_fg)}" ) if self.image_key is not None and self.image_key not in metadata_set: issues.append( @@ -328,9 +325,7 @@ def require(self, *keys: str) -> "CytoSchema": """ missing = [key for key in keys if getattr(self, key, None) is None] if missing: - raise ValueError( - f"CytoSchema is missing required key(s): {missing}" - ) + raise ValueError(f"CytoSchema is missing required key(s): {missing}") return self def to_dict(self) -> dict: diff --git a/tests/test_lazy.py b/tests/test_lazy.py index daa1aa9..c5af565 100644 --- a/tests/test_lazy.py +++ b/tests/test_lazy.py @@ -35,9 +35,7 @@ def test_to_lazy_returns_cytolazyframe(profiles: pd.DataFrame): def test_lazy_filter_matches_pandas(profiles: pd.DataFrame): cdf = CytoDataFrame(profiles) - lazy_result = ( - cdf.to_lazy().filter(pl.col("Cells_AreaShape_Area") >= 30.0).collect() - ) + lazy_result = cdf.to_lazy().filter(pl.col("Cells_AreaShape_Area") >= 30.0).collect() pandas_result = profiles[profiles["Cells_AreaShape_Area"] >= 30.0] assert isinstance(lazy_result, CytoDataFrame) @@ -51,9 +49,7 @@ def test_lazy_filter_matches_pandas(profiles: pd.DataFrame): def test_lazy_eager_equivalence(profiles: pd.DataFrame): """Lazy and eager polars execution produce identical results.""" cdf = CytoDataFrame(profiles) - lazy_df = ( - cdf.to_lazy().filter(pl.col("Metadata_Well") == "B02").to_polars() - ) + lazy_df = cdf.to_lazy().filter(pl.col("Metadata_Well") == "B02").to_polars() eager_df = cdf.to_polars().filter(pl.col("Metadata_Well") == "B02") assert lazy_df.equals(eager_df) @@ -102,9 +98,7 @@ def test_lazy_join(profiles: pd.DataFrame): annotations = pl.DataFrame( {"Metadata_Well": ["A01", "B02"], "treatment": ["drug", "ctrl"]} ) - result = ( - cdf.to_lazy().join(annotations, on="Metadata_Well", how="inner").collect() - ) + result = cdf.to_lazy().join(annotations, on="Metadata_Well", how="inner").collect() assert "treatment" in result.columns # only A01 (2 rows) + B02 (2 rows) survive the inner join assert len(result) == 4 diff --git a/tests/test_schema.py b/tests/test_schema.py index e4cced9..f3dbbc2 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -66,9 +66,7 @@ def test_schema_classification_buckets(cellprofiler_frame: pd.DataFrame): def test_schema_inference_matches_across_backends(cellprofiler_frame: pd.DataFrame): """pandas, polars, and Arrow inference agree.""" from_pandas = CytoSchema.from_pandas(cellprofiler_frame).to_dict() - from_polars = CytoSchema.from_polars( - pl.from_pandas(cellprofiler_frame) - ).to_dict() + from_polars = CytoSchema.from_polars(pl.from_pandas(cellprofiler_frame)).to_dict() from_arrow = CytoSchema.from_arrow( pa.Table.from_pandas(cellprofiler_frame, preserve_index=False).schema ).to_dict() @@ -77,9 +75,10 @@ def test_schema_inference_matches_across_backends(cellprofiler_frame: pd.DataFra def test_schema_infer_dispatch(cellprofiler_frame: pd.DataFrame): table = pa.Table.from_pandas(cellprofiler_frame, preserve_index=False) - assert CytoSchema.infer(table).to_dict() == CytoSchema.infer( - cellprofiler_frame - ).to_dict() + assert ( + CytoSchema.infer(table).to_dict() + == CytoSchema.infer(cellprofiler_frame).to_dict() + ) assert CytoSchema.infer(pl.from_pandas(cellprofiler_frame).lazy()).to_dict() == ( CytoSchema.infer(cellprofiler_frame).to_dict() ) @@ -138,8 +137,7 @@ def test_cytodataframe_cyto_schema_property(cellprofiler_frame: pd.DataFrame): ) def test_schema_partition_invariants(columns: list, numeric_seed: list): numeric = { - name: numeric_seed[idx % len(numeric_seed)] - for idx, name in enumerate(columns) + name: numeric_seed[idx % len(numeric_seed)] for idx, name in enumerate(columns) } schema = CytoSchema.from_columns(columns, numeric=numeric) From f7a15d62d6888622bae140f74521037e5e385dfd Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 19 Jun 2026 12:09:01 -0600 Subject: [PATCH 3/3] address coderabbit review --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62a1c50..a4c82ea 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ With CytoDataFrame you can: - Automatically detect 3D image volumes and render interactive [trame](https://github.com/Kitware/trame) views in notebooks when 3D dependencies are installed (with graceful fallback otherwise). - Interoperate with the [Polars](https://pola.rs/) and [Apache Arrow](https://arrow.apache.org/) ecosystems while keeping the familiar Pandas-based experience. -### Polars and Arrow interoperability +## Polars and Arrow interoperability CytoDataFrame uses Apache Arrow as its canonical schema/interchange contract and Polars as an execution engine, while Pandas remains the compatibility layer. You