cytomining · d33bs · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -15,7 +15,7 @@ repos:
     -   id: check-yaml
     -   id: detect-private-key
 -   repo: https://github.com/tox-dev/pyproject-fmt
-    rev: "v2.23.0"
+    rev: "v2.25.0"
     hooks:
     -   id: pyproject-fmt
 -   repo: https://github.com/codespell-project/codespell
@@ -50,7 +50,7 @@ repos:
     hooks:
     -   id: actionlint
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.15.15"
+    rev: "v0.15.18"
     hooks:
     -   id: ruff-format
     -   id: ruff-check

@@ -23,6 +23,39 @@ With CytoDataFrame you can:
 - Highlight image objects using mask or outline files to understand their segmentation.
 - Adjust image displays on-the-fly using interactive slider widgets.
 - Automatically detect 3D image volumes and render interactive [trame](https://github.com/Kitware/trame) views in notebooks when 3D dependencies are installed (with graceful fallback otherwise).
+- Interoperate with the [Polars](https://pola.rs/) and [Apache Arrow](https://arrow.apache.org/) ecosystems while keeping the familiar Pandas-based experience.
+
+## Polars and Arrow interoperability
+
+CytoDataFrame uses Apache Arrow as its canonical schema/interchange contract and
+Polars as an execution engine, while Pandas remains the compatibility layer. You
+can move between representations and run lazy, scalable queries without leaving
+the CytoDataFrame API:
+
+```python
+import polars as pl
+from cytodataframe import CytoDataFrame
+
+# Construct from a file path (as shown here), a pandas or polars object
+# (DataFrame or LazyFrame), or a pyarrow Table.
+cdf = CytoDataFrame("profiles.parquet")
+
+# Convert out to any representation (Pandas stays a boundary layer).
+cdf.to_pandas()   # pandas.DataFrame
+cdf.to_polars()   # polars.DataFrame
+cdf.to_arrow()    # pyarrow.Table
+cdf.to_lazy()     # CytoLazyFrame (lazy, Polars-backed)
+
+# Inspect the inferred schema (metadata / feature / geometry roles).
+cdf.cyto_schema
+
+# Lazily scan large Parquet datasets with predicate/projection pushdown.
+result = (
+    CytoDataFrame.scan_parquet("profiles.parquet")
+    .filter(pl.col("Metadata_Well") == "A01")
+    .select_features()
+    .collect()  # -> CytoDataFrame
+)
+```
 
 For 3D notebook display behavior:
 
@@ -53,6 +86,20 @@ pip install cytodataframe
 pip install git+https://github.com/cytomining/CytoDataFrame.git
 ```
 
+The core install is intentionally lean. Heavier, feature-specific stacks are
+available as optional extras:
+
+```shell
+# interactive 3D volume rendering (trame / pyvista)
+pip install "cytodataframe[viz3d]"
+
+# OME-Arrow image read/write/embedding (to_ome_parquet, OME-Arrow columns)
+pip install "cytodataframe[ome]"
+
+# everything
+pip install "cytodataframe[all]"
+```
+
 ## Contributing, Development, and Testing
 
 Please see our [contributing](https://cytomining.github.io/CytoDataFrame/main/contributing) documentation for more details on contributions, development, and testing.

@@ -23,17 +23,25 @@ dependencies = [
   "imagecodecs>=2024.9.22,<2027",
   "imageio>=2.37,<3",
   "ipython>=8.12.3,<10",
-  "ipyvolume>=0.6.3,<0.7",
   "ipywidgets>=8.1.7,<9",
-  "matplotlib>=3.9.3,<4",
-  "nest-asyncio>=1.6,<2",
-  "ome-arrow>=0.0.3,<0.0.10",
   "opencv-python>=4.10.0.84,<5",
   "pandas>=2.2.2,<4",
+  "polars>=1,<2",
   "pyarrow>=16",
-  "pyvista>=0.46.4",
-  "pywavelets>1.4.1",
   "scikit-image>0.19.3",
+]
+optional-dependencies.all = [
+  "cytodataframe[ome,viz3d]",
+]
+# Optional feature stacks. Install with e.g. `pip install "cytodataframe[viz3d,ome]"`.
+# These are imported lazily, so the core package imports fine without them.
+optional-dependencies.ome = [
+  # OME-Arrow image read/write/embedding (to_ome_parquet, OME-Arrow columns).
+  "ome-arrow>=0.0.3,<0.0.10",
+]
+optional-dependencies.viz3d = [
+  # Interactive 3D volume rendering (trame/pyvista views in notebooks).
+  "pyvista>=0.46.4",
   "trame>=3.12",
   "trame-vtk>=2.10",
   "trame-vuetify>=3.1",
@@ -46,14 +54,21 @@ dev = [
   "coverage>=7.6,<8",
   "duckdb>=1.1.3,<2",
   "httpcore>=0.18,<1.1",
+  "hypothesis>=6,<7",
   "isort>=5.13.2,<9",
   "jupyterlab>=4.3,<5",
   "jupyterlab-code-formatter>=3.0.2,<4",
   "jupytext>=1.16.4,<2",
+  # optional-feature stacks needed to exercise the full test suite
+  "ome-arrow>=0.0.3,<0.0.10",
   "poethepoet>=0.37,<0.47",
   "pytest>=8.3.3,<10",
   "pytest-cov>=5,<8",
+  "pyvista>=0.46.4",
   "sqlalchemy>=1.3.6,<3",
+  "trame>=3.12",
+  "trame-vtk>=2.10",
+  "trame-vuetify>=3.1",
 ]
 docs = [
   "dunamai>=1.22,<2",
@@ -65,14 +80,14 @@ docs = [
 ]
 
 [tool.setuptools]
-package-dir = { "" = "src" }
 packages.find.where = [ "src" ]
+package-dir = { "" = "src" }
 
 [tool.setuptools_scm]
-root = "."
+version_file = "src/cytodataframe/_version.py"
 version_scheme = "no-guess-dev"
 local_scheme = "no-local-version"
-version_file = "src/cytodataframe/_version.py"
+root = "."
 
 [tool.uv]
 default-groups = [

@@ -2,7 +2,17 @@
 Initialization for cytodataframe package
 """
 
+from . import engine
 from .frame import CytoDataFrame
+from .lazy import CytoLazyFrame
+from .schema import CytoSchema
 
 # note: version placeholder is updated during builds
 __version__ = "0.0.0"
+
+# Names exported by ``from cytodataframe import *``: the three classes and the
+# ``engine`` submodule imported at the top of this package module.
+__all__ = [
+    "CytoDataFrame",
+    "CytoLazyFrame",
+    "CytoSchema",
+    "engine",
+]
@@ -0,0 +1,214 @@
+"""
+Backend abstraction layer for CytoDataFrame.
+
+This module is the execution/interchange boundary described in the CytoDataFrame
+evolution plan. It treats Apache Arrow as the canonical schema and memory
+contract, Polars as the execution engine, and pandas as a compatibility layer.
+
+The functions here normalize the supported tabular inputs
+
+    * :class:`pandas.DataFrame` / :class:`pandas.Series`
+    * :class:`polars.DataFrame`
+    * :class:`polars.LazyFrame`
+    * :class:`pyarrow.Table`
+    * :class:`cytodataframe.frame.CytoDataFrame` (a ``pandas.DataFrame`` subclass)
+
+into the representation requested by the caller while preserving row counts,
+null semantics, column ordering, and schema.
+
+Design notes:
+    * Arrow is used as the bridge whenever a schema/serialization contract is
+      requested (``to_arrow``).
+    * Conversions intentionally avoid forcing existing *pandas* object columns
+      (which may hold numpy image arrays or OME-Arrow structs) through Arrow,
+      because Arrow cannot always round-trip arbitrary Python objects. Such
+      columns are only converted when the caller explicitly asks for an Arrow or
+      Polars representation.
+"""
+
+from __future__ import annotations
+
+import pathlib
+from typing import TYPE_CHECKING, Any, Union
+
+import pandas as pd
+
+if TYPE_CHECKING:  # pragma: no cover - typing only
+    import polars as pl
+    import pyarrow as pa
+
+# Public alias describing every tabular input CytoDataFrame's engine understands.
+# Members are written as strings (forward references): polars and pyarrow are
+# imported only under ``TYPE_CHECKING`` above, so ``pl`` and ``pa`` are not bound
+# at runtime when this alias is evaluated.
+TabularData = Union[
+    "pd.DataFrame",
+    "pd.Series",
+    "pl.DataFrame",
+    "pl.LazyFrame",
+    "pa.Table",
+]
+
+
+def _polars() -> Any:
+    """Import polars lazily so importing this module stays cheap."""
+    import polars as pl
+
+    return pl
+
+
+def _pyarrow() -> Any:
+    """Import pyarrow lazily so importing this module stays cheap."""
+    import pyarrow as pa
+
+    return pa
+
+
+def is_polars_dataframe(data: Any) -> bool:
+    """Return True when ``data`` is a :class:`polars.DataFrame`."""
+    try:
+        pl = _polars()
+    except ImportError:
+        return False
+    return isinstance(data, pl.DataFrame)
+
+
+def is_polars_lazyframe(data: Any) -> bool:
+    """Return True when ``data`` is a :class:`polars.LazyFrame`."""
+    try:
+        pl = _polars()
+    except ImportError:
+        return False
+    return isinstance(data, pl.LazyFrame)
+
+
+def is_arrow_table(data: Any) -> bool:
+    """Return True when ``data`` is a :class:`pyarrow.Table`."""
+    try:
+        pa = _pyarrow()
+    except ImportError:
+        return False
+    return isinstance(data, pa.Table)
+
+
+def is_supported(data: Any) -> bool:
+    """Return True when ``data`` is one of the supported tabular inputs."""
+    return (
+        isinstance(data, (pd.DataFrame, pd.Series))
+        or is_polars_dataframe(data)
+        or is_polars_lazyframe(data)
+        or is_arrow_table(data)
+    )
+
+
+def to_pandas(data: TabularData) -> pd.DataFrame:
+    """
+    Convert any supported tabular input to a :class:`pandas.DataFrame`.
+
+    pandas inputs (including ``CytoDataFrame``) are returned as-is so that object
+    columns holding images or OME-Arrow structs are never disturbed.
+    """
+    if isinstance(data, pd.DataFrame):
+        return data
+    if isinstance(data, pd.Series):
+        return data.to_frame()
+    if is_polars_lazyframe(data):
+        return data.collect().to_pandas()
+    if is_polars_dataframe(data):
+        return data.to_pandas()
+    if is_arrow_table(data):
+        return data.to_pandas()
+    raise TypeError(
+        f"Unsupported type for CytoDataFrame engine conversion: {type(data)!r}"
+    )
+
+
+def to_polars(data: TabularData) -> "pl.DataFrame":
+    """Convert any supported tabular input to an eager :class:`polars.DataFrame`."""
+    pl = _polars()
+    if isinstance(data, pl.DataFrame):
+        return data
+    if isinstance(data, pl.LazyFrame):
+        return data.collect()
+    if is_arrow_table(data):
+        return pl.from_arrow(data)
+    if isinstance(data, pd.Series):
+        data = data.to_frame()
+    if isinstance(data, pd.DataFrame):
+        # Strip any pandas subclass (e.g. CytoDataFrame) and index before handing
+        # the frame to polars, which has no index concept.
+        try:
+            return pl.from_pandas(pd.DataFrame(data))
+        except Exception as exc:
+            raise TypeError(
+                "Could not convert pandas data to polars. Columns holding "
+                "non-Arrow-compatible Python objects (e.g. numpy image arrays) "
+                "cannot be represented in polars/Arrow."
+            ) from exc
+    raise TypeError(
+        f"Unsupported type for CytoDataFrame engine conversion: {type(data)!r}"
+    )
+
+
+def to_lazyframe(data: TabularData) -> "pl.LazyFrame":
+    """Convert any supported tabular input to a :class:`polars.LazyFrame`."""
+    pl = _polars()
+    if isinstance(data, pl.LazyFrame):
+        return data
+    return to_polars(data).lazy()
+
+
+def to_arrow(data: TabularData, *, preserve_index: bool = False) -> "pa.Table":
+    """
+    Convert any supported tabular input to a :class:`pyarrow.Table`.
+
+    Arrow is the canonical schema/serialization contract, so this is the
+    conversion used whenever schema or interchange guarantees matter.
+    """
+    pa = _pyarrow()
+    if is_arrow_table(data):
+        return data
+    if is_polars_lazyframe(data):
+        return data.collect().to_arrow()
+    if is_polars_dataframe(data):
+        return data.to_arrow()
+    if isinstance(data, pd.Series):
+        data = data.to_frame()
+    if isinstance(data, pd.DataFrame):
+        try:
+            return pa.Table.from_pandas(
+                pd.DataFrame(data), preserve_index=preserve_index
+            )
+        except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as exc:
+            raise TypeError(
+                "Could not convert pandas data to an Arrow table. Columns "
+                "holding non-Arrow-compatible Python objects (e.g. numpy image "
+                "arrays) cannot be represented in Arrow."
+            ) from exc
+    raise TypeError(
+        f"Unsupported type for CytoDataFrame engine conversion: {type(data)!r}"
+    )
+
+
+def normalize_to_pandas(data: TabularData) -> pd.DataFrame:
+    """
+    Normalize a supported input to pandas for the compatibility facade.
+
+    This is the ingestion entry point used by ``CytoDataFrame.__init__`` to wrap
+    Polars/Arrow inputs while keeping pandas as the backing store.
+    """
+    return to_pandas(data)
+
+
+def scan_parquet(source: Union[str, pathlib.Path], **kwargs: Any) -> "pl.LazyFrame":
+    """
+    Lazily scan a Parquet file/dataset into a :class:`polars.LazyFrame`.
+
+    This enables predicate/projection pushdown for large profiling datasets
+    without materializing them eagerly.
+    """
+    pl = _polars()
+    return pl.scan_parquet(source, **kwargs)
+
+
+def read_parquet(source: Union[str, pathlib.Path], **kwargs: Any) -> "pl.DataFrame":
+    """Eagerly read a Parquet file into a :class:`polars.DataFrame`."""
+    pl = _polars()
+    return pl.read_parquet(source, **kwargs)