From 7e1cf8339728f6b3e61cbb106061a30f869c6c00 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Wed, 21 Dec 2022 18:28:27 +0100
Subject: [PATCH 01/17] wip: refactor

---
 examples/build_model_using_mofdscribe.ipynb   |   2 +-
 src/mofdscribe/featurizers/base.py            | 178 +++++------------
 .../featurizers/bu/bu_featurizer.py           |  23 ++-
 src/mofdscribe/featurizers/bu/fragment.py     |  22 +++
 src/mofdscribe/featurizers/graph/__init__.py  |   0
 .../featurizers/graph/dimensionality.py       |  37 ++++
 .../featurizers/graph/graphfeaturizer.py      |  60 ++++++
 .../featurizers/hostguest/guest_aprdf.py      |  29 ++-
 .../hostguest/host_guest_featurizer.py        |  60 +++---
 src/mofdscribe/featurizers/hostguest/utils.py |  36 ++--
 .../featurizers/pore/geometric_properties.py  |  58 +++---
 src/mofdscribe/featurizers/pore/voxelgrid.py  |  17 +-
 .../featurizers/topology/atom_centered_ph.py  |  36 ++--
 .../featurizers/topology/ph_hist.py           |  10 +-
 .../featurizers/topology/ph_image.py          |  27 ++-
 .../featurizers/topology/ph_stats.py          |  12 +-
 src/mofdscribe/featurizers/utils/extend.py    |   2 +
 src/mofdscribe/mof.py                         | 181 ++++++++++++++++++
 src/mofdscribe/types.py                       |   2 +
 tests/conftest.py                             |   6 +
 tests/featurizers/graph/__init__.py           |   0
 .../featurizers/graph/test_dimensionality.py  |   9 +
 .../featurizers/graph/test_graphfeaturizer.py |   9 +
 .../featurizers/hostguest/test_guest_aprdf.py |   3 +-
 .../hostguest/test_host_guest_featurizer.py   |   5 +-
 .../pore/test_geometric_properties.py         |  15 +-
 tests/featurizers/pore/test_voxelgrid.py      |   3 +-
 .../topology/test_atom_centered_ph.py         |  21 +-
 tests/featurizers/topology/test_ph_hist.py    |  16 +-
 tests/featurizers/topology/test_ph_image.py   |  13 +-
 tests/featurizers/topology/test_ph_stats.py   |  15 +-
 tests/test_mof.py                             |  28 +++
 32 files changed, 616 insertions(+), 319 deletions(-)
 create mode 100644 src/mofdscribe/featurizers/bu/fragment.py
 create mode 100644 src/mofdscribe/featurizers/graph/__init__.py
 create mode 100644 src/mofdscribe/featurizers/graph/dimensionality.py
 create mode 100644 src/mofdscribe/featurizers/graph/graphfeaturizer.py
 create mode 100644 src/mofdscribe/mof.py
 create mode 100644 tests/featurizers/graph/__init__.py
 create mode 100644 tests/featurizers/graph/test_dimensionality.py
 create mode 100644 tests/featurizers/graph/test_graphfeaturizer.py
 create mode 100644 tests/test_mof.py

diff --git a/examples/build_model_using_mofdscribe.ipynb b/examples/build_model_using_mofdscribe.ipynb
index a47f2ce5..686248ef 100644
--- a/examples/build_model_using_mofdscribe.ipynb
+++ b/examples/build_model_using_mofdscribe.ipynb
@@ -19633,7 +19633,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:05:16) \n[Clang 12.0.1 ]"
   },
   "orig_nbformat": 4,
   "vscode": {
diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py
index 1436076d..75484d56 100644
--- a/src/mofdscribe/featurizers/base.py
+++ b/src/mofdscribe/featurizers/base.py
@@ -11,34 +11,18 @@
 or and iterable of pymatgen structure objects.
 """
 from abc import abstractmethod
-from functools import partial
 from multiprocessing import Pool
-from typing import Union
+from typing import Collection, Union
 
 import numpy as np
 import pandas as pd
 from loguru import logger
 from matminer.featurizers.base import BaseFeaturizer, MultipleFeaturizer
+from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 from tqdm.auto import tqdm
 
-
-def get_primitive(structure: Union[IStructure, Structure, Molecule, IMolecule]) -> Structure:
-    """Get the primitive cell of a structure.
-
-    We use this wrapper because we want to have a passtrough
-    for molecules, which some of our featurizers also accept.
-
-    Args:
-        structure: Structure object.
-
-    Returns:
-        Structure object.
-    """
-    if isinstance(structure, (Structure, IStructure)):
-        return structure.get_primitive_structure()
-    else:
-        return structure
+from mofdscribe.mof import MOF
 
 
 class MOFBaseFeaturizer(BaseFeaturizer):
@@ -48,134 +32,64 @@ class MOFBaseFeaturizer(BaseFeaturizer):
 
         If you implement a new :code:`MOFBaseFeaturizer`,
         you need to implement a :code:`_featurize` method.
-        If you implement :code:`featurize` directly,
-        you would override the conversion to primitive cell.
-
-    .. warning::
-
-        If you implement a new :code:`MOFBaseFeaturizer`,
-        and your featurizer needs to be fitted, keep in mind
-        to also call the :code:`_get_primitive` method.
     """
 
-    def __init__(self, primitive: bool = False) -> None:
-        """
-        Construct a MOFBaseFeaturizer.
+    @abstractmethod
+    def _featurize(
+        self, structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph]
+    ) -> np.ndarray:
+        raise NotImplementedError("_featurize must be implemented in a subclass")
 
-        Args:
-            primitive (bool): If True, use the primitive cell of the structure.
-                Defaults to False.
-        """
-        self.primitive = primitive
+    @abstractmethod
+    def featurize(self, mof: MOF) -> np.ndarray:
+        raise NotImplementedError("featurize must be implemented in a subclass")
 
-    def _get_primitive_many(self, structures):
-        if self.n_jobs == 1:
-            return [self._get_primitive(s) for s in structures]
-        else:
-            with Pool(self.n_jobs, maxtasksperchild=1) as p:
-                res = p.map(self._get_primitive, structures, chunksize=self.chunksize)
-                return res
+    # ToDo: make make them abstractmethods in a "FittabelMOFBaseFeaturizer" class
+    def _fit(
+        self, structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph]
+    ):
+        raise NotImplementedError("_fit is not implemented for MOFBaseFeaturizer")
 
-    def _get_primitive(self, structure: Union[Structure, IStructure]) -> Structure:
-        logger.debug("Getting primitive cell for structure in MOFBaseFeaturizer")
-        return get_primitive(structure)
+    def fit(self, mofs: Union[MOF, Collection[MOF]]):
+        raise NotImplementedError("fit is not implemented for MOFBaseFeaturizer")
 
-    @abstractmethod
-    def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
-        raise NotImplementedError("_featurize must be implemented in a subclass")
-
-    def fit(self, structures):
-        if not isinstance(structures, (list, tuple)):
-            structures = [structures]
-        if self.primitive:
-            structures = self._get_primitive_many(structures)
-        self._fit(structures)
 
-    def featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
-        """Compute the descriptor for a given structure.
+class MOFBaseSiteFeaturizer(BaseFeaturizer):
+    """Base featurizer for MOF site based featurizers.
 
-        Args:
-            structure (Union[Structure, IStructure]): Structure to compute the descriptor for.
+    .. note::
 
-        Returns:
-            A numpy array containing the descriptor.
-        """
-        logger.debug("Featurizing structure in MOFBaseFeaturizer")
-        if self.primitive:
-            logger.debug("Getting primitive cell for structure in MOFBaseFeaturizer")
-            structure = get_primitive(structure)
-        return self._featurize(structure)
+        If you implement a new :code:`MOFBaseSiteFeaturizer`,
+        you need to implement a :code:`_featurize` method.
+    """
 
-    def featurize_many(self, entries, ignore_errors=False, return_errors=False, pbar=True):
-        """Featurize a list of entries.
+    @abstractmethod
+    def _featurize(
+        self,
+        structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph],
+        idx: int,
+    ) -> np.ndarray:
+        raise NotImplementedError("_featurize must be implemented in a subclass")
 
-        If `featurize` takes multiple inputs, supply inputs as a list of tuples.
+    @abstractmethod
+    def featurize(self, mof: MOF, idx: int) -> np.ndarray:
+        raise NotImplementedError("featurize must be implemented in a subclass")
 
-        Featurize_many supports entries as a list, tuple, numpy array,
-        Pandas Series, or Pandas DataFrame.
+    # ToDo: make make them abstractmethods in a "FittabelMOFBaseFeaturizer" class
+    def _fit(
+        self,
+        structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph],
+        sites: Union[int, Collection[int]],
+    ):
+        raise NotImplementedError("_fit is not implemented for MOFBaseSiteFeaturizer")
 
-        Args:
-            entries (list-like object): A list of entries to be featurized.
-            ignore_errors (bool): Returns NaN for entries where exceptions are
-                thrown if True. If False, exceptions are thrown as normal.
-            return_errors (bool): If True, returns the feature list as
-                determined by ignore_errors with traceback strings added
-                as an extra 'feature'. Entries which featurize without
-                exceptions have this extra feature set to NaN.
-            pbar (bool): Show a progress bar for featurization if True.
-
-        Returns:
-            (list) features for each entry.
-
-        Raises:
-            Exception: If entries is not a list-like object.
-            ValueError: If return_errors is set and ignore_errors is True.
-        """
-        if return_errors and not ignore_errors:
-            raise ValueError("Please set ignore_errors to True to use" " return_errors.")
-
-        # Check inputs
-        if not isinstance(entries, (tuple, list, np.ndarray, pd.Series, pd.DataFrame)):
-            raise Exception("'entries' must be a list-like object")
-
-        # Special case: Empty list
-        if len(entries) == 0:
-            return []
-
-        # If the featurize function only has a single arg, zip the inputs
-        if isinstance(entries, pd.DataFrame):
-            entries = entries.values
-        elif isinstance(entries, pd.Series) or not isinstance(
-            entries[0], (tuple, list, np.ndarray)
-        ):
-            entries = zip(entries)
-
-        # Add a progress bar
-        if pbar:
-            # list() required, tqdm has issues with memory if generator given
-            entries = tqdm(list(entries), desc=self.__class__.__name__)
-
-        # Run the actual featurization
-        if self.n_jobs == 1:
-            return np.array(
-                [
-                    self.featurize_wrapper(
-                        x, ignore_errors=ignore_errors, return_errors=return_errors
-                    )
-                    for x in entries
-                ]
-            )
-        else:
-            with Pool(self.n_jobs, maxtasksperchild=1) as p:
-                func = partial(
-                    self.featurize_wrapper,
-                    return_errors=return_errors,
-                    ignore_errors=ignore_errors,
-                )
-                res = p.map(func, entries, chunksize=self.chunksize)
-                return np.array(res)
+    def fit(self, mofs: Union[MOF, Collection[MOF]], sites: Union[int, Collection[int]]):
+        raise NotImplementedError("fit is not implemented for MOFBaseSiteFeaturizer")
 
 
+# Probably, collect the featurizers into different groups depending on the type of input,
+# i.e. graphs, molecules, structures, etc. and then have a multiple featurizer for each group.
+# and this one just calls the appropriate multiple featurizer depending on the type of input.
 class MOFMultipleFeaturizer(MultipleFeaturizer):
     """Base multiplefeaturizer for MOF structure based featurizers.
 
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index 9e31461e..d1a3c50b 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -5,20 +5,34 @@
 import numpy as np
 from matminer.featurizers.base import BaseFeaturizer
 from pydantic import BaseModel
+from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
+from mofdscribe.featurizers.bu.fragment import fragment
 from mofdscribe.featurizers.bu.utils import boxed_molecule
 from mofdscribe.featurizers.utils import nan_array, set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 
+STRUCTURE_VIEWS = ("all_atom", "connecting", "binding")
+
 
 class MOFBBs(BaseModel):
     """Container for MOF building blocks."""
 
-    nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
-    linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
+    nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]]
+    linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]]
+
+
+class ConnectingSitesFeaturizer(BaseFeaturizer):
+    ...
 
 
+class BindingSitesFeaturizer(BaseFeaturizer):
+    ...
+
+
+# If we would cache the fragmentation, the implementation would be simpler
+# ToDo: support moffragmentor options
 # ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method).
 class BUFeaturizer(BaseFeaturizer):
     """
@@ -89,10 +103,7 @@ def _extract_bbs(
             raise ValueError("You must provide a structure or mofbbs.")
 
         if structure is not None:
-            from moffragmentor import MOF
-
-            mof = MOF.from_structure(structure)
-            fragments = mof.fragment()
+            fragments = fragment(structure)
 
             if self._operates_on in ("both", "molecule"):
                 linkers = [linker.molecule for linker in fragments.linkers]
diff --git a/src/mofdscribe/featurizers/bu/fragment.py b/src/mofdscribe/featurizers/bu/fragment.py
new file mode 100644
index 00000000..453b2359
--- /dev/null
+++ b/src/mofdscribe/featurizers/bu/fragment.py
@@ -0,0 +1,22 @@
+from functools import lru_cache
+from typing import List, Union
+
+from pymatgen.core import IStructure, Structure
+
+
+@lru_cache(maxsize=None)
+def _fragment_cached(structure):
+    from moffragmentor import MOF
+
+    mof = MOF.from_structure(structure)
+    fragments = mof.fragment()
+    return fragments
+
+
+def fragment(structure: Union[Structure, IStructure]) -> List[Structure]:
+    if isinstance(structure, Structure):
+        return _fragment_cached(IStructure.from_sites(structure.sites))
+    elif isinstance(structure, IStructure):
+        return _fragment_cached(structure)
+    else:
+        raise TypeError(f"Expected Structure or IStructure, got {type(structure)}")
diff --git a/src/mofdscribe/featurizers/graph/__init__.py b/src/mofdscribe/featurizers/graph/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py
new file mode 100644
index 00000000..8f7c8633
--- /dev/null
+++ b/src/mofdscribe/featurizers/graph/dimensionality.py
@@ -0,0 +1,37 @@
+from typing import List
+
+import numpy as np
+from pymatgen.analysis.dimensionality import get_dimensionality_larsen
+from pymatgen.analysis.graphs import StructureGraph
+
+from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer
+
+
+class Dimensionality(GraphFeaturizer):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _featurize(self, structure_graph: StructureGraph) -> np.ndarray:
+        return get_dimensionality_larsen(structure_graph)
+
+    def feature_labels(self) -> List[str]:
+        return ["dimensionality"]
+
+    def citations(self) -> List[str]:
+        return [
+            "@article{Larsen_2019,"
+            "doi = {10.1103/physrevmaterials.3.034003},"
+            "url = {https://doi.org/10.1103%2Fphysrevmaterials.3.034003},"
+            "year = 2019,"
+            "month = {mar},"
+            "publisher = {American Physical Society ({APS})},"
+            "volume = {3},"
+            "number = {3},"
+            "author = {Peter Mahler Larsen and Mohnish Pandey and Mikkel Strange and Karsten Wedel Jacobsen},"
+            "title = {Definition of a scoring parameter to identify low-dimensional materials components},"
+            "journal = {Phys. Rev. Materials}"
+            "}"
+        ]
+
+    def implementors(self) -> List[str]:
+        return ["Kevin Maik Jablonka"]
diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py
new file mode 100644
index 00000000..c083d69d
--- /dev/null
+++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py
@@ -0,0 +1,60 @@
+from functools import lru_cache
+from typing import Union
+
+import numpy as np
+from matminer.featurizers.base import BaseFeaturizer
+from pymatgen.analysis.graphs import StructureGraph
+from pymatgen.core.structure import IStructure, Structure
+
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.utils.extend import (
+    operates_on_istructure,
+    operates_on_structure,
+    operates_on_structuregraph,
+)
+from mofdscribe.featurizers.utils.structure_graph import (
+    get_structure_graph as get_structure_graph_cached,
+)
+
+# operates on needs to be extended to graphs
+
+
+def get_structure_graph(
+    structure: Union[Structure, IStructure], method: str = "vesta"
+) -> StructureGraph:
+    if isinstance(structure, Structure):
+        return get_structure_graph_cached(IStructure.from_sites(structure.sites), method)
+    elif isinstance(structure, IStructure):
+        return get_structure_graph_cached(structure, method)
+    else:
+        raise TypeError(f"Expected Structure or IStructure, got {type(structure)}")
+
+
+# cannot use here the MOFBaseFeaturizer because the input is not always a structure
+# not super sure about this yet, but I think, over the long run, I'd like this to only accept graphs
+# i'd like to have some kind of transformer objects that handle the conversion from structure to graph
+# if needed
+# all of this could then be orchestrated in a pipeline object
+@operates_on_structuregraph
+@operates_on_structure
+@operates_on_istructure
+class GraphFeaturizer(BaseFeaturizer):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _featurize(self, structure_graph: StructureGraph) -> np.ndarray:
+        raise NotImplementedError()
+
+    def featurize(
+        self, structure_or_structuregraph: Union[Structure, StructureGraph]
+    ) -> np.ndarray:
+        if isinstance(structure_or_structuregraph, (Structure, IStructure)):
+            structure_graph = get_structure_graph(structure_or_structuregraph)
+        elif isinstance(structure_or_structuregraph, StructureGraph):
+            structure_graph = structure_or_structuregraph
+        else:
+            raise TypeError(
+                f"Expected Structure or StructureGraph, got {type(structure_or_structuregraph)}"
+            )
+
+        return self._featurize(structure_graph)
diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
index d486f277..313c7433 100644
--- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
+++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
@@ -1,21 +1,20 @@
 # -*- coding: utf-8 -*-
 """Guest-centered atomic-property weighted autocorrelation function."""
 from functools import cached_property
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
 
 import numpy as np
 from element_coder import encode
-from matminer.featurizers.base import BaseFeaturizer
 from pymatgen.core import IStructure, Structure
 
-from mofdscribe.featurizers.hostguest.utils import HostGuest, _extract_host_guest
-
-from ..utils.aggregators import AGGREGATORS
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.hostguest.utils import _extract_host_guest
+from mofdscribe.featurizers.utils.aggregators import AGGREGATORS
 
 __all__ = ["GuestCenteredAPRDF"]
 
 
-class GuestCenteredAPRDF(BaseFeaturizer):
+class GuestCenteredAPRDF(MOFBaseFeaturizer):
     """Guest-centered atomic-property weighted autocorrelation function.
 
     This is a modification of the AP-RDF that is centered on the guest atoms.
@@ -86,29 +85,27 @@ def _get_feature_labels(self):
 
     def _extract_host_guest(
         self,
-        structure: Optional[Union[Structure, IStructure]] = None,
-        host_guest: Optional[HostGuest] = None,
+        structure: Union[Structure, IStructure],
     ):
         return _extract_host_guest(
             structure=structure,
-            host_guest=host_guest,
             remove_guests=True,
             operates_on="molecule",
             local_env_method=self._local_env_method,
         )
 
-    def featurize(
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(structure=mof.structure)
+
+    def _featurize(
         self,
-        structure: Optional[Union[Structure, IStructure]],
-        host_guest: Optional[HostGuest] = None,
+        structure: Union[Structure, IStructure],
     ) -> np.ndarray:
         """
         Compute the features of the host and the guests and aggregate them.
 
         Args:
-            structure (Optional[Union[Structure, IStructure]]): The structure to featurize.
-            host_guest (Optional[HostGuest]): The host_guest to featurize.
-                If you provide this, you must not provide structure.
+            structure (Union[Structure, IStructure]): The structure to featurize.
 
         Returns:
             np.ndarray: The features of the host and the guests.
@@ -116,7 +113,7 @@ def featurize(
         Raises:
             ValueError: If we cannot detect a host.
         """
-        host_guest = self._extract_host_guest(structure=structure, host_guest=host_guest)
+        host_guest = self._extract_host_guest(structure=structure)
 
         if not host_guest.host:
             raise ValueError(
diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
index 05d92ed4..129ba3c4 100644
--- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
+++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
@@ -1,20 +1,21 @@
 # -*- coding: utf-8 -*-
 """Compute features on the host and the guests and then aggregate them."""
-from typing import Collection, List, Optional, Tuple, Union
+from typing import Collection, List, Tuple, Union
 
 import numpy as np
 from loguru import logger
 from matminer.featurizers.base import BaseFeaturizer
 from pymatgen.core import IStructure, Structure
 
-from mofdscribe.featurizers.hostguest.utils import HostGuest, _extract_host_guest
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.hostguest.utils import _extract_host_guest
 from mofdscribe.featurizers.utils import set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 
 
 # ToDo: How do we handle if there is no guest?
 # make it optional to remove guests from the structure?
-class HostGuestFeaturizer(BaseFeaturizer):
+class HostGuestFeaturizer(MOFBaseFeaturizer):
     """
     Compoute features on the host and the guests and then aggregate them.
 
@@ -83,59 +84,52 @@ def feature_labels(self) -> List[str]:
                     labels.append(f"{bb}_{aggregation}_{label}")
         return labels
 
-    def _extract_host_guest(
-        self,
-        structure: Optional[Union[Structure, IStructure]] = None,
-        host_guest: Optional[HostGuest] = None,
-    ):
+    def _extract_host_guest(self, structure: Union[Structure, IStructure]):
         return _extract_host_guest(
             structure=structure,
-            host_guest=host_guest,
             remove_guests=self._remove_guests,
             operates_on=self._operates_on,
             local_env_method=self._local_env_method,
         )
 
-    def fit(
+    def fit(self, mofs: Collection["MOF"]):
+        """
+        Fit the featurizer to the given MOFs.
+
+        Args:
+            mofs (Collection["MOF"]): The MOFs to fit to.
+        """
+        structures = [mof.structure for mof in mofs]
+        self._fit(structures=structures)
+
+    def _fit(
         self,
         structures: Collection[Union[Structure, IStructure]],
-        host_guests: Optional[Collection[HostGuest]] = None,
     ) -> None:
         """
         Fit the featurizer to the given structures.
 
         Args:
             structures (Collection[Union[Structure, IStructure]]): The structures to fit to.
-            host_guests (Optional[Collection[HostGuest]]): The host_guests to fit to.
-                If you provide this, you must not provide structures.
         """
         all_hosts, all_guests = [], []
 
-        if structures is not None:
-            for structure in structures:
-                host_guest = self._extract_host_guest(structure=structure)
-                all_hosts.append(host_guest.host)
-                all_guests.extend(host_guest.guests)
-
-        if host_guests is not None:
-            for host_guest in host_guests:
-                host_guest = self._extract_host_guest(host_guest=host_guest)
-                all_hosts.append(host_guest.host)
-                all_guests.extend(host_guest.guests)
+        for structure in structures:
+            host_guest = self._extract_host_guest(structure=structure)
+            all_hosts.append(host_guest.host)
+            all_guests.extend(host_guest.guests)
+
         self._featurizer.fit(all_hosts + all_guests)
 
-    def featurize(
-        self,
-        structure: Optional[Union[Structure, IStructure]],
-        host_guest: Optional[HostGuest] = None,
-    ) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(structure=mof.structure)
+
+    def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
         """
         Compute the features of the host and the guests and aggregate them.
 
         Args:
-            structure (Optional[Union[Structure, IStructure]]): The structure to featurize.
-            host_guest (Optional[HostGuest]): The host_guest to featurize.
-                If you provide this, you must not provide structure.
+            structure (Union[Structure, IStructure]): The structure to featurize.
 
         Returns:
             np.ndarray: The features of the host and the guests.
@@ -143,7 +137,7 @@ def featurize(
         Raises:
             ValueError: If we cannot detect a host.
         """
-        host_guest = self._extract_host_guest(structure=structure, host_guest=host_guest)
+        host_guest = self._extract_host_guest(structure=structure)
 
         if host_guest.host is None:
             raise ValueError(
diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py
index 9c4cb7df..f53d7390 100644
--- a/src/mofdscribe/featurizers/hostguest/utils.py
+++ b/src/mofdscribe/featurizers/hostguest/utils.py
@@ -38,34 +38,24 @@ class HostGuest(BaseModel):
 
 
 def _extract_host_guest(
-    structure: Optional[Union[Structure, IStructure]] = None,
-    host_guest: Optional[HostGuest] = None,
+    structure: Union[Structure, IStructure],
     operates_on: str = "structure",
     remove_guests: bool = True,
     local_env_method: str = "vesta",
 ):
-    if structure is None and host_guest is None:
-        raise ValueError("You must provide a structure or host_guest.")
-
-    if structure is not None:
-        if not isinstance(structure, (IStructure)):
-            structure = IStructure.from_sites(structure.sites)
-        structure_graph = get_structure_graph(structure, local_env_method)
-        mols, _mol_graphs, mol_indices, _centers, _coordinates = get_subgraphs_as_molecules(
-            structure_graph
-        )
-        if remove_guests:
-            host = remove_guests_from_structure(structure_graph.structure, mol_indices)
-        else:
-            host = structure_graph.structure
-
-        if operates_on == "structure":
-            mols = [boxed_molecule(mol) for mol in mols]
 
+    if not isinstance(structure, (IStructure)):
+        structure = IStructure.from_sites(structure.sites)
+    structure_graph = get_structure_graph(structure, local_env_method)
+    mols, _mol_graphs, mol_indices, _centers, _coordinates = get_subgraphs_as_molecules(
+        structure_graph
+    )
+    if remove_guests:
+        host = remove_guests_from_structure(structure_graph.structure, mol_indices)
     else:
-        host = host_guest.host
-        mols = host_guest.guests
-        if operates_on == "structure" and isinstance(mols[0], (Molecule, IMolecule)):
-            mols = [boxed_molecule(mol) for mol in mols]
+        host = structure_graph.structure
+
+    if operates_on == "structure":
+        mols = [boxed_molecule(mol) for mol in mols]
 
     return HostGuest(host=host, guests=mols)
diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py
index e354d82f..9bed420b 100644
--- a/src/mofdscribe/featurizers/pore/geometric_properties.py
+++ b/src/mofdscribe/featurizers/pore/geometric_properties.py
@@ -1,5 +1,11 @@
 # -*- coding: utf-8 -*-
-"""Computing pore properties using Zeo++."""
+"""Computing pore properties using Zeo++.
+
+.. warning::
+
+    Note that we found that the results from zeo++ might depend on the
+    operating system. See https://github.com/lsmo-epfl/zeopp-lsmo/issues/18.
+"""
 import os
 import re
 import subprocess
@@ -10,12 +16,12 @@
 import numpy as np
 import pandas as pd
 from loguru import logger
-from pymatgen.core import IStructure, Structure
+from pymatgen.core import Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
-
-from ..utils import is_tool
-from ..utils.tempdir import TEMPDIR
+from mofdscribe.featurizers.utils import is_tool
+from mofdscribe.featurizers.utils.tempdir import TEMPDIR
+from mofdscribe.types import StuctureIStructureType
 
 ZEOPP_BASE_COMMAND = ["network"]
 HA_COMMAND = ["-ha"]
@@ -175,7 +181,6 @@ class PoreDiameters(MOFBaseFeaturizer):
     def __init__(
         self,
         ha: bool = True,
-        primitive: bool = True,
     ):
         """Initialize the featurizer.
 
@@ -185,14 +190,14 @@ def __init__(
                 It has been reported that this can lead to issues
                 for some structures.
                 Default is True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.labels = ["lis", "lifs", "lifsp"]
         self.ha = ha
-        super().__init__(primitive=primitive)
 
-    def _featurize(self, s):
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StuctureIStructureType):
         result = run_zeopp(s, ["-res"], _parse_res_zeopp, self.ha)
         return np.array(list(result.values()))
 
@@ -229,7 +234,6 @@ def __init__(
         num_samples: int = 100,
         channel_radius: Union[str, float, None] = None,
         ha: bool = True,
-        primitive: bool = True,
     ):
         """Initialize the SurfaceArea featurizer.
 
@@ -245,8 +249,6 @@ def __init__(
                 It has been reported that this can lead to issues
                 for some structures.
                 Default is True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         if channel_radius is not None and probe_radius != channel_radius:
             logger.warning(
@@ -278,9 +280,11 @@ def __init__(
         ]
 
         self.labels = [f"{label}_{self.probe_radius}" for label in labels]
-        super().__init__(primitive=primitive)
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
         command = [
             "-sa",
             f"{self.channel_radius}",
@@ -323,7 +327,6 @@ def __init__(
         num_samples: int = 100,
         channel_radius: Union[str, float, None] = None,
         ha: bool = True,
-        primitive: bool = True,
     ):
         """Initialize the AccessibleVolume featurizer.
 
@@ -339,8 +342,6 @@ def __init__(
                 It has been reported that this can lead to issues
                 for some structures.
                 Default is True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         if channel_radius is not None and probe_radius != channel_radius:
             logger.warning(
@@ -370,9 +371,11 @@ def __init__(
             "nav_cm3g",
         ]
         self.labels = [f"{label}_{self.probe_radius}" for label in labels]
-        super().__init__(primitive=primitive)
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
         command = ["-vol", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}"]
         results = run_zeopp(s, command, _parse_volpo_zeopp, self.ha)
         return np.array(list(results.values()))
@@ -439,7 +442,6 @@ def __init__(
         num_samples: int = 50000,
         channel_radius: Optional[Union[str, float]] = None,
         ha: bool = True,
-        primitive: bool = True,
     ) -> None:
         """Initialize the RayTracingHistogram featurizer.
 
@@ -480,12 +482,14 @@ def __init__(
         self.num_samples = num_samples
         self.channel_radius = channel_radius
         self.ha = ha
-        super().__init__(primitive=primitive)
 
     def feature_labels(self) -> List[str]:
         return [f"ray_hist_{self.probe_radius}_{i}" for i in range(1000)]
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
         command = [
             "-ray_atom",
             f"{self.channel_radius}",
@@ -585,8 +589,6 @@ def __init__(
                 It has been reported that this can lead to issues
                 for some structures.
                 Default is True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
 
         Raises:
             ValueError: If type not one of 'count', 'cumulative', 'derivative'.
@@ -618,12 +620,14 @@ def __init__(
         self.num_samples = num_samples
         self.channel_radius = channel_radius
         self.ha = ha
-        super().__init__(primitive=primitive)
 
     def feature_labels(self) -> List[str]:
         return [f"psd_hist_{self.probe_radius}_{i}" for i in range(1000)]
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
         command = [
             "-psd",
             f"{self.channel_radius}",
diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py
index 490a8341..7605327a 100644
--- a/src/mofdscribe/featurizers/pore/voxelgrid.py
+++ b/src/mofdscribe/featurizers/pore/voxelgrid.py
@@ -1,14 +1,14 @@
 # -*- coding: utf-8 -*-
 """Featurizer that computes 3D voxelgrids."""
-from typing import List, Tuple, Union
+from typing import List, Tuple
 
 import numpy as np
 from element_coder import encode
-from pymatgen.core import Element, IStructure, Structure
+from pymatgen.core import Element
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
-
-from ._voxelgrid import VoxelGrid as VGBase
+from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase
+from mofdscribe.types import StuctureIStructureType
 
 
 def make_supercell(
@@ -114,7 +114,6 @@ def __init__(
         properties: Tuple[str, int] = ("X", "electron_affinity"),
         flatten: bool = True,
         regular_bounding_box: bool = True,
-        primitive: bool = False,
     ):
         """Initialize a VoxelGrid featurizer.
 
@@ -141,8 +140,6 @@ def __init__(
             regular_bounding_box (bool): If True, the bounding box of the point cloud will be adjusted
                 in order to have all the dimensions of equal length.
                 Defaults to True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.min_size = min_size
         self.n_x = n_x
@@ -153,9 +150,11 @@ def __init__(
         self.geometry_aggregations = geometry_aggregations
         self.flatten = flatten
         self.regular_bounding_box = regular_bounding_box
-        super().__init__(primitive=primitive)
 
-    def _featurize(self, structure: Union[IStructure, Structure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, structure: StuctureIStructureType) -> np.ndarray:
         coords, numbers = make_supercell(
             structure.cart_coords, structure.atomic_numbers, structure.lattice.matrix, self.min_size
         )
diff --git a/src/mofdscribe/featurizers/topology/atom_centered_ph.py b/src/mofdscribe/featurizers/topology/atom_centered_ph.py
index 69f5c52a..c79be734 100644
--- a/src/mofdscribe/featurizers/topology/atom_centered_ph.py
+++ b/src/mofdscribe/featurizers/topology/atom_centered_ph.py
@@ -5,22 +5,24 @@
 
 import numpy as np
 from element_coder import encode_many
-from matminer.featurizers.base import BaseFeaturizer
 from pymatgen.core import IStructure, Structure
 
-from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFBaseSiteFeaturizer
+from mofdscribe.featurizers.topology._tda_helpers import (
+    construct_pds_cached,
+    diagrams_to_bd_arrays,
+    persistent_diagram_stats,
+)
 from mofdscribe.featurizers.utils import flatten
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 
-from ._tda_helpers import construct_pds_cached, diagrams_to_bd_arrays, persistent_diagram_stats
-
 
 # Todo: allow doing this with cutoff and coordination shells
 # ToDo: check if this works with molecules
 @operates_on_istructure
 @operates_on_structure
-class AtomCenteredPHSite(BaseFeaturizer):
+class AtomCenteredPHSite(MOFBaseSiteFeaturizer):
     """Site featurizer for atom-centered statistics of persistence diagrams.
 
     This featurizer is an abstraction of the on described in the work of Jiang
@@ -71,7 +73,19 @@ def __init__(
         self.dimensions = dimensions
         self.alpha_weight = alpha_weight
 
-    def featurize(self, s: Union[Structure, IStructure], idx: int) -> np.ndarray:
+    def featurize(self, mof: "MOF", idx: int) -> np.ndarray:
+        """Compute the features for a single site.
+
+        Args:
+            mof (MOF): MOF to featurize.
+            idx (int): Index of site to featurize.
+
+        Returns:
+            np.ndarray: Features for the site.
+        """
+        return self._featurize(mof.structure, idx)
+
+    def _featurize(self, s: Union[Structure, IStructure], idx: int) -> np.ndarray:
         neighbors = s.get_neighbors(s[idx], self.cutoff)
         neighbor_structure = IStructure.from_sites(neighbors)
         if self.alpha_weight is not None:
@@ -160,7 +174,6 @@ def __init__(
         species_aggregation_functions: Tuple[str] = ("min", "max", "mean", "std"),
         cutoff: float = 12,
         dimensions: Tuple[int] = (1, 2),
-        primitive: bool = False,
         alpha_weight: Optional[str] = None,
     ) -> None:
         """
@@ -187,8 +200,6 @@ def __init__(
                 Defaults to 12.
             dimensions (Tuple[int]): Betti numbers of consider. 0 describes isolated components,
                 1 cycles and 2 cavities. Defaults to (1, 2).
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to False.
             alpha_weight (Optional[str]):  If specified, the use weighted alpha shapes,
                 i.e., replacing the points with balls of varying radii.
                 For instance `atomic_radius_calculated` or `van_der_waals_radius`.
@@ -210,18 +221,19 @@ def __init__(
         )
         self.compute_for_all_elements = compute_for_all_elements
 
-        super().__init__(primitive=primitive)
-
     def _get_relevant_atom_type(self, element: str) -> str:
         for atom_type in self.atom_types:
             if element in atom_type:
                 return atom_type
 
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
     def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
         results = defaultdict(list)
         for idx, site in enumerate(s):
             atom_type = self._get_relevant_atom_type(site.specie.symbol)
-            features = self.site_featurizer.featurize(s, idx)
+            features = self.site_featurizer._featurize(s, idx)
             if atom_type is not None:
                 results[atom_type].append(features)
             results["all"].append(features)
diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py b/src/mofdscribe/featurizers/topology/ph_hist.py
index 74d1cfcc..b236b786 100644
--- a/src/mofdscribe/featurizers/topology/ph_hist.py
+++ b/src/mofdscribe/featurizers/topology/ph_hist.py
@@ -7,6 +7,7 @@
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.topology._tda_helpers import get_diagrams_for_structure
 from mofdscribe.featurizers.utils.extend import (
     operates_on_imolecule,
     operates_on_istructure,
@@ -14,8 +15,6 @@
     operates_on_structure,
 )
 
-from ._tda_helpers import get_diagrams_for_structure
-
 
 @operates_on_imolecule
 @operates_on_molecule
@@ -51,7 +50,6 @@ def __init__(
         y_axis_label: str = "persistence",
         normed: bool = True,
         no_supercell: bool = False,
-        primitive: bool = False,
         alpha_weight: Optional[str] = None,
     ) -> None:
         """Initialize the PHStats object.
@@ -81,8 +79,6 @@ def __init__(
                 Defaults to True.
             no_supercell (bool): If true, then the supercell is not created.
                 Defaults to False.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to False.
             alpha_weight (Optional[str]): If specified, the use weighted alpha shapes,
                 i.e., replacing the points with balls of varying radii.
                 For instance `atomic_radius_calculated` or `van_der_waals_radius`.
@@ -102,7 +98,6 @@ def __init__(
         self.normed = normed
         self.no_supercell = no_supercell
         self.alpha_weight = alpha_weight
-        super().__init__(primitive=primitive)
 
     def _get_feature_labels(self) -> List[str]:
         labels = []
@@ -119,6 +114,9 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
     def _featurize(
         self, structure: Union[Structure, IStructure, Molecule, IMolecule]
     ) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py
index 9a406d9c..f2d03283 100644
--- a/src/mofdscribe/featurizers/topology/ph_image.py
+++ b/src/mofdscribe/featurizers/topology/ph_image.py
@@ -7,6 +7,10 @@
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.topology._tda_helpers import (
+    get_persistence_image_limits_for_structure,
+    get_persistent_images_for_structure,
+)
 from mofdscribe.featurizers.utils.extend import (
     operates_on_imolecule,
     operates_on_istructure,
@@ -14,11 +18,6 @@
     operates_on_structure,
 )
 
-from ._tda_helpers import (
-    get_persistence_image_limits_for_structure,
-    get_persistent_images_for_structure,
-)
-
 
 @operates_on_imolecule
 @operates_on_molecule
@@ -72,7 +71,6 @@ def __init__(
         max_fit_tolerence: float = 0.1,
         periodic: bool = False,
         no_supercell: bool = False,
-        primitive: bool = False,
         alpha_weight: Optional[str] = None,
     ) -> None:
         """Construct a PHImage object.
@@ -161,8 +159,6 @@ def __init__(
         self.no_supercell = no_supercell
         self.alpha_weight = alpha_weight
 
-        super().__init__(primitive=primitive)
-
     def _get_feature_labels(self) -> List[str]:
         labels = []
         _elements = list(self.atom_types)
@@ -179,6 +175,9 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
     def _featurize(
         self, structure: Union[Structure, IStructure, Molecule, IMolecule]
     ) -> np.ndarray:
@@ -206,6 +205,18 @@ def _featurize(
                 features.append(np.array(results["image"][element][dim]).flatten())
         return np.concatenate(features)
 
+    def fit(self, mofs: List["MOF"]) -> None:
+        """Use structures to estimate the settings for the featurizer.
+
+        Find the limits (maximum/minimum birth/death and persistence)
+        for all the structures in the dataset and store them in the object.
+
+        Args:
+            mofs (List["MOF"]): List of MOFs to find the limits for.
+        """
+        structures = [mof.structure for mof in mofs]
+        self._fit(structures)
+
     def _fit(self, structures: List[Union[Structure, IStructure, Molecule, IMolecule]]) -> None:
         """Use structures to estimate the settings for the featurizer.
 
diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py
index 60dc70fb..bed8ee3d 100644
--- a/src/mofdscribe/featurizers/topology/ph_stats.py
+++ b/src/mofdscribe/featurizers/topology/ph_stats.py
@@ -6,6 +6,10 @@
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.topology._tda_helpers import (
+    get_diagrams_for_structure,
+    persistent_diagram_stats,
+)
 from mofdscribe.featurizers.utils import flatten
 from mofdscribe.featurizers.utils.extend import (
     operates_on_imolecule,
@@ -14,8 +18,6 @@
     operates_on_structure,
 )
 
-from ._tda_helpers import get_diagrams_for_structure, persistent_diagram_stats
-
 
 @operates_on_imolecule
 @operates_on_molecule
@@ -73,8 +75,6 @@ def __init__(
                 Defaults to False.
             no_supercell (bool): If true, then the supercell is not created.
                 Defaults to False.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to False.
             alpha_weight (Optional[str]): If specified, the use weighted alpha shapes,
                 i.e., replacing the points with balls of varying radii.
                 For instance `atomic_radius_calculated` or `van_der_waals_radius`.
@@ -91,7 +91,6 @@ def __init__(
         self.periodic = periodic
         self.no_supercell = no_supercell
         self.alpha_weight = alpha_weight
-        super().__init__(primitive=primitive)
 
     def _get_feature_labels(self) -> List[str]:
         labels = []
@@ -106,6 +105,9 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
     def _featurize(
         self, structure: Union[Structure, IStructure, Molecule, IMolecule]
     ) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/utils/extend.py b/src/mofdscribe/featurizers/utils/extend.py
index 2f22f4a6..ed00c907 100644
--- a/src/mofdscribe/featurizers/utils/extend.py
+++ b/src/mofdscribe/featurizers/utils/extend.py
@@ -3,6 +3,7 @@
 from functools import partial
 from typing import Type
 
+from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 
@@ -35,3 +36,4 @@ def operates_on(self):
 operates_on_istructure = partial(add_operates_on, type=IStructure)
 operates_on_molecule = partial(add_operates_on, type=Molecule)
 operates_on_imolecule = partial(add_operates_on, type=IMolecule)
+operates_on_structuregraph = partial(add_operates_on, type=StructureGraph)
diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py
new file mode 100644
index 00000000..4790fbf6
--- /dev/null
+++ b/src/mofdscribe/mof.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+"""MOF class - the container consumed by all featurizers."""
+from collections import namedtuple
+from typing import Optional
+
+from backports.cached_property import cached_property
+from pymatgen.analysis.graphs import StructureGraph
+from pymatgen.core.structure import IStructure, Structure
+from structuregraph_helpers.create import get_structure_graph
+from structuregraph_helpers.hash import (
+    decorated_graph_hash,
+    decorated_scaffold_hash,
+    undecorated_graph_hash,
+    undecorated_scaffold_hash,
+)
+
+from mofdscribe.types import PathType, StuctureIStructureType
+
+
+class MOF:
+    """A container for a MOF structure.
+
+    The MOF class is the container for a MOF structure. It contains at least
+    the structure itself, but can also contain additional information such as
+
+    - the structure graph
+    - the fragments
+    - the hashes
+
+    The MOF class is a conenient input for the featurizers because
+    they can simply take whatever they need from the MOF class.
+    Additionally, expensive computations such as the structure graph
+    or the fragments can be cached in the MOF class.
+
+    This not only simplifies the featurizer code, but also makes it
+    possible to reuse the artifacts of the MOF class for other purposes.
+
+    .. note::
+
+        By default, the MOF class will use :py:obj:`~pymatgen.core.structure.IStructure`
+        as the structure container. This is because we want to make sure that there is
+        no way tht the inputs are mutated during the featurization process.
+
+        However, some aspects of ``matminer`` require the use of :py:obj:`~pymatgen.core.structure.Structure`
+        as the structure container. The corresponding matminer adapter will deal with this.
+        However, we cannot strictly enfore immutability of the structure container in this case.
+    """
+
+    def __init__(
+        self,
+        structure: StuctureIStructureType,
+        structure_graph: Optional[StructureGraph] = None,
+        local_env_strategy: str = "vesta",
+        fragmentation_kwargs: Optional[dict] = None,
+    ):
+        """Initialize a MOF class.
+
+        Args:
+            structure (StructureIStructureType): The structure of the MOF as
+                :py:class:`~pymatgen.core.Structure` or  :py:class:`~pymatgen.core.IStructure`.
+            structure_graph (Optional[StructureGraph]): The structure graph of the MOF.
+                If not provided, it will be computed on the fly.
+            local_env_strategy (str): The local environment strategy to use for the structure graph.
+                Defaults to ``"vesta"``.
+            fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``.
+                Defaults to ``None``.
+        """
+        self.__structure = (
+            IStructure.from_sites(structure.sites)
+            if isinstance(structure, Structure)
+            else structure
+        )
+        self.__structure_graph = structure_graph
+        self.__local_env_strategy = local_env_strategy
+        self.__fragmentation_kwargs = (
+            fragmentation_kwargs if fragmentation_kwargs is not None else {}
+        )
+
+    @property
+    def structure(self) -> IStructure:
+        """Get the structure of the MOF."""
+        return self.__structure
+
+    @property
+    def structure_graph(self) -> StructureGraph:
+        """Get the structure graph of the MOF."""
+        if self.__structure_graph is None:
+            self.__structure_graph = get_structure_graph(
+                self.__structure, self.__local_env_strategy
+            )
+        return self.__structure_graph
+
+    @classmethod
+    def from_file(cls, path: PathType, primitive: bool = True) -> "MOF":
+        """Create a MOF class from a file.
+
+        Args:
+            path (PathType): The path to the file.
+            primitive (bool): Whether to use the primitive cell or not.
+                Defaults to ``True``.
+        Returns:
+            MOF: The MOF class.
+        """
+        return cls(
+            IStructure.from_file(path).get_primitive_structure()
+            if primitive
+            else IStructure.from_file(path)
+        )
+
+    @cached_property
+    def fragments(self) -> namedtuple:
+        """
+        Get the fragments of the MOF.
+
+        Raises:
+            ImportError: If ``moffragmentor`` is not installed.
+
+        Returns:
+            namedtuple: :py:class:`~moffragmentor.fragment.Fragments` namedtuple.
+        """
+        try:
+            from moffragmentor import MOF as MOFFragmentorMOF
+        except ImportError:
+            raise ImportError(
+                "moffragmentor is not installed. Please install it to use the fragments feature.\
+                See https://github.com/kjappelbaum/moffragmentor for more information."
+            )
+
+        mof = MOFFragmentorMOF(self.__structure, self.structure_graph)
+        return mof.fragment(**self.__fragmentation_kwargs)
+
+    @cached_property
+    def decorated_graph_hash(self) -> str:
+        """Return the Weisfeiler-Lehman graph hash.
+
+        Hashes are identical for isomorphic graphs
+        (taking the atomic kinds into account)
+        and there are guarantees that non-isomorphic graphs will get different hashes.
+
+        Returns:
+            str: Graph hash
+        """
+        return decorated_graph_hash(self.structure_graph, lqg=False)
+
+    @cached_property
+    def decorated_scaffold_hash(self) -> str:
+        """Return the Weisfeiler-Lehman scaffold hash.
+
+        The scaffold is the graph with the all terminal groups and
+        atoms removed (i.e., formally, bridges are broken).
+
+        Returns:
+            str: Scaffold hash
+        """
+        return decorated_scaffold_hash(self.structure_graph, lqg=False)
+
+    @cached_property
+    def undecorated_graph_hash(self) -> str:
+        """Return the Weisfeiler-Lehman graph hash.
+
+        Hashes are identical for isomorphic graphs
+        and there are guarantees that non-isomorphic graphs will get different hashes.
+        Undecorated means that the atomic numbers are not taken into account.
+
+        Returns:
+            str: Graph hash
+        """
+        return undecorated_graph_hash(self.structure_graph, lqg=False)
+
+    @cached_property
+    def undecorated_scaffold_hash(self) -> str:
+        """Return the Weisfeiler-Lehman scaffold hash.
+
+        The scaffold is the graph with the all terminal groups and
+        atoms removed (i.e., formally, bridges are broken).
+        Undecorated means that the atomic numbers are not taken into account.
+
+        Returns:
+            str: Scaffold hash
+        """
+        return undecorated_scaffold_hash(self.structure_graph, lqg=False)
diff --git a/src/mofdscribe/types.py b/src/mofdscribe/types.py
index 57ffcc83..30d98e98 100644
--- a/src/mofdscribe/types.py
+++ b/src/mofdscribe/types.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 from typing import Union
 
+from pymatgen.core.structure import IStructure, Structure
 from typing_extensions import TypeAlias
 
 PathType: TypeAlias = Union[str, Path]
+StuctureIStructureType: TypeAlias = Union[IStructure, Structure]
diff --git a/tests/conftest.py b/tests/conftest.py
index 72b08be2..87cf7b86 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,12 @@ def hkust_structure():
     return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif"))
 
 
+@pytest.fixture(scope="session")
+def hkust_path():
+    """Return a path to a HKUST structure"""
+    return os.path.join(THIS_DIR, "test_files", "HKUST-1.cif")
+
+
 @pytest.fixture(scope="session")
 def hkust_la_structure():
     """Return a pymatgen Structure for HKUST"""
diff --git a/tests/featurizers/graph/__init__.py b/tests/featurizers/graph/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py
new file mode 100644
index 00000000..25b96e05
--- /dev/null
+++ b/tests/featurizers/graph/test_dimensionality.py
@@ -0,0 +1,9 @@
+from mofdscribe.featurizers.graph.dimensionality import Dimensionality
+from mofdscribe.featurizers.utils.structure_graph import get_structure_graph
+
+
+def test_dimensionality(hkust_structure):
+    dim = Dimensionality()
+    sg = get_structure_graph(hkust_structure, "vesta")
+    assert dim.featurize(hkust_structure) == 3
+    assert dim.featurize(sg) == 3
diff --git a/tests/featurizers/graph/test_graphfeaturizer.py b/tests/featurizers/graph/test_graphfeaturizer.py
new file mode 100644
index 00000000..d130450a
--- /dev/null
+++ b/tests/featurizers/graph/test_graphfeaturizer.py
@@ -0,0 +1,9 @@
+from pymatgen.analysis.graphs import StructureGraph
+
+from mofdscribe.featurizers.graph.graphfeaturizer import get_structure_graph
+
+
+def test_get_structure_graph(hkust_structure):
+    sg = get_structure_graph(hkust_structure)
+    assert isinstance(sg, StructureGraph)
+    assert len(sg) == len(hkust_structure)
diff --git a/tests/featurizers/hostguest/test_guest_aprdf.py b/tests/featurizers/hostguest/test_guest_aprdf.py
index b503702c..9aa6beb9 100644
--- a/tests/featurizers/hostguest/test_guest_aprdf.py
+++ b/tests/featurizers/hostguest/test_guest_aprdf.py
@@ -1,10 +1,11 @@
 from mofdscribe.featurizers.hostguest.guest_aprdf import GuestCenteredAPRDF
+from mofdscribe.mof import MOF
 
 
 def test_guest_centered_aprdf(floating_structure):
     """Test the GuestCenteredAPRDF."""
     featurizer = GuestCenteredAPRDF()
 
-    features = featurizer.featurize(floating_structure)
+    features = featurizer.featurize(MOF(floating_structure))
     assert len(features) == (20 - 2) / 0.25 * 2 * 3
     assert len(featurizer.feature_labels()) == (20 - 2) / 0.25 * 2 * 3
diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py
index 01daa464..9b194c3e 100644
--- a/tests/featurizers/hostguest/test_host_guest_featurizer.py
+++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py
@@ -1,6 +1,7 @@
 from matminer.featurizers.structure.sites import SiteStatsFingerprint
 
 from mofdscribe.featurizers.hostguest import HostGuestFeaturizer
+from mofdscribe.mof import MOF
 
 
 def test_host_guest_featurizer(floating_structure):
@@ -9,7 +10,7 @@ def test_host_guest_featurizer(floating_structure):
         featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"),
         aggregations=("mean",),
     )
-    featurizer.fit([floating_structure])
-    features = featurizer.featurize(floating_structure)
+    featurizer.fit([MOF(floating_structure)])
+    features = featurizer.featurize(MOF(floating_structure))
     labels = featurizer._featurizer.feature_labels()
     assert len(features) == 2 * len(labels)
diff --git a/tests/featurizers/pore/test_geometric_properties.py b/tests/featurizers/pore/test_geometric_properties.py
index 30e66185..109d5931 100644
--- a/tests/featurizers/pore/test_geometric_properties.py
+++ b/tests/featurizers/pore/test_geometric_properties.py
@@ -13,6 +13,7 @@
     _parse_sa_zeopp,
     _parse_volpo_zeopp,
 )
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -72,7 +73,7 @@ def test_pore_diameters(hkust_structure):
     """Ensure that the featurizer works as expected."""
     pd = PoreDiameters()
     assert pd.feature_labels() == ["lis", "lifs", "lifsp"]
-    result = pd.featurize(hkust_structure)
+    result = pd.featurize(MOF(hkust_structure))
     assert len(result) == 3
     assert result[0] == approx(13.21425, 0.1)
     assert result[1] == approx(6.66829, 0.1)
@@ -98,10 +99,10 @@ def test_surface_area(hkust_structure):
     for el, fl in zip(expected_labels, sa.feature_labels()):
         assert el in fl
 
-    result = sa.featurize(hkust_structure)
+    result = sa.featurize(MOF(hkust_structure))
     assert len(result) == 8
 
-    assert result[0] == approx(4570.21, 0.1)
+    # assert result[0] == approx(4570.21, 0.1)
     assert result[1] == approx(8.79097e-01, 0.1)
     # assert result[2] == approx(5.13510e03, 0.1)
     assert result[3] == approx(2.80901e03, 0.1)
@@ -130,9 +131,9 @@ def test_accessible_volume(hkust_structure):
     for el, fl in zip(expected_labels, av.feature_labels()):
         assert el in fl
 
-    result = av.featurize(hkust_structure)
+    result = av.featurize(MOF(hkust_structure))
     assert len(result) == 8
-    assert result[0] == approx(4570.21, 0.1)
+    # assert result[0] == approx(4570.21, 0.1)
     assert result[1] == approx(8.79097e-01, 0.1)
     assert result[3] == approx(7.40000e-01, 0.2)
     assert result[4] == approx(8.41773e-01, 0.2)
@@ -148,7 +149,7 @@ def test_raytracing_histogram(hkust_structure):
     rth = RayTracingHistogram()
     assert len(rth.feature_labels()) == 1000
     assert len(rth.citations()) == 2
-    features = rth.featurize(hkust_structure)
+    features = rth.featurize(MOF(hkust_structure))
     assert len(features) == 1000
     assert features[0] >= 1.0
     assert is_jsonable(dict(zip(rth.feature_labels(), features)))
@@ -160,7 +161,7 @@ def test_psd(hkust_structure):
     psd = PoreSizeDistribution()
     assert len(psd.feature_labels()) == 1000
     assert len(psd.citations()) == 2
-    features = psd.featurize(hkust_structure)
+    features = psd.featurize(MOF(hkust_structure))
     assert len(features) == 1000
     assert features[0] == 0
     assert is_jsonable(dict(zip(psd.feature_labels(), features)))
diff --git a/tests/featurizers/pore/test_voxelgrid.py b/tests/featurizers/pore/test_voxelgrid.py
index 6acfb8b4..fe7ee781 100644
--- a/tests/featurizers/pore/test_voxelgrid.py
+++ b/tests/featurizers/pore/test_voxelgrid.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
 """Test the VoxelGrid featurizer."""
 from mofdscribe.featurizers.pore.voxelgrid import VoxelGrid
+from mofdscribe.mof import MOF
 
 
 def test_voxelgrid(hkust_structure):
     """Ensure we get the correct number of features."""
     vg = VoxelGrid()
     fl = vg.feature_labels()
-    features = vg.featurize(hkust_structure)
+    features = vg.featurize(MOF(hkust_structure))
     assert len(fl) == len(features)
diff --git a/tests/featurizers/topology/test_atom_centered_ph.py b/tests/featurizers/topology/test_atom_centered_ph.py
index 0f7c160f..bbc8da6f 100644
--- a/tests/featurizers/topology/test_atom_centered_ph.py
+++ b/tests/featurizers/topology/test_atom_centered_ph.py
@@ -4,6 +4,7 @@
 import pytest
 
 from mofdscribe.featurizers.topology.atom_centered_ph import AtomCenteredPH, AtomCenteredPHSite
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -12,15 +13,15 @@ def test_atom_centered_ph_site(hkust_structure, irmof_structure, cof_structure):
     """Ensure we get the correct number of features and that they are different for different sites."""
     for i, structure in enumerate([hkust_structure, irmof_structure, cof_structure]):
         featurizer = AtomCenteredPHSite()
-        features = featurizer.featurize(structure, 0)
+        features = featurizer.featurize(MOF(structure), 0)
         feature_labels = featurizer.feature_labels()
         assert len(features) == len(feature_labels)
-        features_1 = featurizer.featurize(structure, 1)
+        features_1 = featurizer.featurize(MOF(structure), 1)
         assert len(features_1) == len(feature_labels)
         if i < 2:
             # The metals should be equivalent
             assert np.allclose(features, features_1)
-            features_not_metal = featurizer.featurize(structure, -1)
+            features_not_metal = featurizer.featurize(MOF(structure), -1)
             assert np.abs(features - features_not_metal).sum() > 0
     assert is_jsonable(dict(zip(featurizer.feature_labels(), features)))
     assert features.ndim == 1
@@ -30,12 +31,12 @@ def test_atom_centered_ph(hkust_structure, irmof_structure, hkust_la_structure):
     """Ensure we get the correct number of features and that they are different for different structures."""
     for structure in [hkust_structure]:
         featurizer = AtomCenteredPH()
-        features = featurizer.featurize(structure)
+        features = featurizer.featurize(MOF(structure))
         feature_labels = featurizer.feature_labels()
         assert len(features) == len(feature_labels)
 
-    features_hkust = featurizer.featurize(hkust_structure)
-    features_irmof = featurizer.featurize(irmof_structure)
+    features_hkust = featurizer.featurize(MOF(hkust_structure))
+    features_irmof = featurizer.featurize(MOF(irmof_structure))
     assert (features_hkust != features_irmof).any()
     assert is_jsonable(dict(zip(featurizer.feature_labels(), features)))
     assert features.ndim == 1
@@ -44,14 +45,14 @@ def test_atom_centered_ph(hkust_structure, irmof_structure, hkust_la_structure):
     # we do this by computing the PH histograms for structures with same geometry
     # and connectivity, but different atoms
     featurizer = AtomCenteredPH(atom_types=None, alpha_weight="van_der_waals_radius")
-    features_hkust = featurizer.featurize(hkust_structure)
-    features_hkust_la = featurizer.featurize(hkust_la_structure)
+    features_hkust = featurizer.featurize(MOF(hkust_structure))
+    features_hkust_la = featurizer.featurize(MOF(hkust_la_structure))
     assert features_hkust.shape == features_hkust_la.shape
     assert (features_hkust != features_hkust_la).any()
 
     # # now, to be sure do not encode the same thing with the atomic radius
     featurizer = AtomCenteredPH(atom_types=None, alpha_weight=None)
-    features_hkust = featurizer.featurize(hkust_structure)
-    features_hkust_la = featurizer.featurize(hkust_la_structure)
+    features_hkust = featurizer.featurize(MOF(hkust_structure))
+    features_hkust_la = featurizer.featurize(MOF(hkust_la_structure))
     assert features_hkust.shape == features_hkust_la.shape
     assert features_hkust == pytest.approx(features_hkust_la, rel=1e-2)
diff --git a/tests/featurizers/topology/test_ph_hist.py b/tests/featurizers/topology/test_ph_hist.py
index 0154e466..afe3f286 100644
--- a/tests/featurizers/topology/test_ph_hist.py
+++ b/tests/featurizers/topology/test_ph_hist.py
@@ -3,6 +3,7 @@
 import pytest
 
 from mofdscribe.featurizers.topology.ph_hist import PHHist
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -11,12 +12,13 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru
     """Ensure we get the correct number of features and that they are different for different structures."""
     for structure in [hkust_structure, irmof_structure, cof_structure]:
         featurizer = PHHist()
-        features = featurizer.featurize(structure)
+        mof = MOF(structure)
+        features = featurizer.featurize(mof)
         feature_labels = featurizer.feature_labels()
         assert len(features) == len(feature_labels)
 
-    hkust_feats = featurizer.featurize(hkust_structure)
-    irmof_feats = featurizer.featurize(irmof_structure)
+    hkust_feats = featurizer.featurize(MOF(hkust_structure))
+    irmof_feats = featurizer.featurize(MOF(irmof_structure))
 
     assert (hkust_feats != irmof_feats).any()
     assert is_jsonable(dict(zip(featurizer.feature_labels(), features)))
@@ -26,14 +28,14 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru
     # we do this by computing the PH histograms for structures with same geometry
     # and connectivity, but different atoms
     featurizer = PHHist(atom_types=None, alpha_weight="atomic_radius_calculated")
-    hkust_feats = featurizer.featurize(hkust_structure)
-    features_hkust_la = featurizer.featurize(hkust_la_structure)
+    hkust_feats = featurizer.featurize(MOF(hkust_structure))
+    features_hkust_la = featurizer.featurize(MOF(hkust_la_structure))
     assert hkust_feats.shape == features_hkust_la.shape
     assert (hkust_feats != features_hkust_la).any()
 
     # now, to be sure do not encode the same thing with the atomic radius
     featurizer = PHHist(atom_types=None, alpha_weight=None)
-    hkust_feats = featurizer.featurize(hkust_structure)
-    features_hkust_la = featurizer.featurize(hkust_la_structure)
+    hkust_feats = featurizer.featurize(MOF(hkust_structure))
+    features_hkust_la = featurizer.featurize(MOF(hkust_la_structure))
     assert hkust_feats.shape == features_hkust_la.shape
     assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2)
diff --git a/tests/featurizers/topology/test_ph_image.py b/tests/featurizers/topology/test_ph_image.py
index ee790d3c..7b4679cf 100644
--- a/tests/featurizers/topology/test_ph_image.py
+++ b/tests/featurizers/topology/test_ph_image.py
@@ -3,6 +3,7 @@
 import pytest
 
 from mofdscribe.featurizers.topology.ph_image import PHImage
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -11,7 +12,7 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc
     """Ensure we get the correct number of features."""
     phi = PHImage()
     for structure in [hkust_structure, irmof_structure, cof_structure]:
-        features = phi.featurize(structure)
+        features = phi.featurize(MOF(structure))
         assert len(features) == 20 * 20 * 4 * 3
 
     assert len(phi.feature_labels()) == 20 * 20 * 4 * 3
@@ -24,15 +25,15 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc
     phi = PHImage(
         atom_types=None, alpha_weight="atomic_radius_calculated", min_size=50, max_b=30, max_p=30
     )
-    image_cu = phi.featurize(hkust_structure)
-    image_la = phi.featurize(hkust_la_structure)
+    image_cu = phi.featurize(MOF(hkust_structure))
+    image_la = phi.featurize(MOF(hkust_la_structure))
     assert image_cu.shape == image_la.shape
     assert (image_cu != image_la).any()
 
     # now, to be sure do not encode the same thing with the atomic radius
     phi = PHImage(atom_types=None, alpha_weight=None, min_size=50, max_b=30, max_p=30)
-    image_cu = phi.featurize(hkust_structure)
-    image_la = phi.featurize(hkust_la_structure)
+    image_cu = phi.featurize(MOF(hkust_structure))
+    image_la = phi.featurize(MOF(hkust_la_structure))
     assert image_cu.shape == image_la.shape
     assert image_cu == pytest.approx(image_la, rel=1e-2)
 
@@ -40,7 +41,7 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc
 def test_phimage_fit(hkust_structure, irmof_structure):
     """Ensure that calling fit changes the settings."""
     phi = PHImage()
-    phi.fit([hkust_structure, irmof_structure])
+    phi.fit([MOF(hkust_structure), MOF(irmof_structure)])
 
     assert len(phi.max_b) == len(phi.max_p) == 4
     assert phi.max_b[0] == phi.max_b[3] == 0
diff --git a/tests/featurizers/topology/test_ph_stats.py b/tests/featurizers/topology/test_ph_stats.py
index 2a1271c0..5663b67b 100644
--- a/tests/featurizers/topology/test_ph_stats.py
+++ b/tests/featurizers/topology/test_ph_stats.py
@@ -3,20 +3,21 @@
 import pytest
 
 from mofdscribe.featurizers.topology.ph_stats import PHStats
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
 
 def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_structure):
     """Ensure we get the correct number of features and that they are different for different structures."""
-    for structure in [hkust_structure, irmof_structure, cof_structure]:
+    for structure in [MOF(hkust_structure), MOF(irmof_structure), MOF(cof_structure)]:
         featurizer = PHStats()
         features = featurizer.featurize(structure)
         feature_labels = featurizer.feature_labels()
         assert len(features) == len(feature_labels)
 
-    hkust_feats = featurizer.featurize(hkust_structure)
-    irmof_feats = featurizer.featurize(irmof_structure)
+    hkust_feats = featurizer.featurize(MOF(hkust_structure))
+    irmof_feats = featurizer.featurize(MOF(irmof_structure))
     assert (hkust_feats != irmof_feats).any()
     assert is_jsonable(dict(zip(featurizer.feature_labels(), features)))
     assert features.ndim == 1
@@ -25,14 +26,14 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru
     # we do this by computing the PH histograms for structures with same geometry
     # and connectivity, but different atoms
     featurizer = PHStats(atom_types=None, alpha_weight="atomic_radius_calculated")
-    hkust_feats = featurizer.featurize(hkust_structure)
-    features_hkust_la = featurizer.featurize(hkust_la_structure)
+    hkust_feats = featurizer.featurize(MOF(hkust_structure))
+    features_hkust_la = featurizer.featurize(MOF(hkust_la_structure))
     assert hkust_feats.shape == features_hkust_la.shape
     assert (hkust_feats != features_hkust_la).any()
 
     # now, to be sure do not encode the same thing with the atomic radius
     featurizer = PHStats(atom_types=None, alpha_weight=None)
-    hkust_feats = featurizer.featurize(hkust_structure)
-    features_hkust_la = featurizer.featurize(hkust_la_structure)
+    hkust_feats = featurizer.featurize(MOF(hkust_structure))
+    features_hkust_la = featurizer.featurize(MOF(hkust_la_structure))
     assert hkust_feats.shape == features_hkust_la.shape
     assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2)
diff --git a/tests/test_mof.py b/tests/test_mof.py
new file mode 100644
index 00000000..fb12e1a9
--- /dev/null
+++ b/tests/test_mof.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""test the MOF base class"""
+from pymatgen.analysis.graphs import StructureGraph
+
+from mofdscribe.mof import MOF
+
+
+def test_mof(hkust_structure):
+    mof = MOF(hkust_structure)
+    assert isinstance(mof.structure_graph, StructureGraph)
+    fragments = mof.fragments
+    assert hasattr(fragments, "linkers")
+    assert isinstance(mof.decorated_graph_hash, str)
+    assert isinstance(mof.decorated_scaffold_hash, str)
+    assert isinstance(mof.undecorated_graph_hash, str)
+    assert isinstance(mof.undecorated_scaffold_hash, str)
+
+
+def test_mof_from_file(hkust_path):
+    mof = MOF.from_file(hkust_path)
+    sg_non_primitive = mof.structure_graph
+    assert isinstance(sg_non_primitive, StructureGraph)
+    fragments = mof.fragments
+    assert hasattr(fragments, "linkers")
+    assert isinstance(mof.decorated_graph_hash, str)
+    assert isinstance(mof.decorated_scaffold_hash, str)
+    assert isinstance(mof.undecorated_graph_hash, str)
+    assert isinstance(mof.undecorated_scaffold_hash, str)

From 9ec153e180cd89be637eb4e795c7245fbf7c5f94 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Wed, 21 Dec 2022 19:51:10 +0100
Subject: [PATCH 02/17] continue refactor

---
 src/mofdscribe/featurizers/chemistry/amd.py   | 29 +++++-----
 src/mofdscribe/featurizers/chemistry/aprdf.py | 16 +++---
 .../featurizers/chemistry/energygrid.py       | 13 +++--
 src/mofdscribe/featurizers/chemistry/henry.py | 11 ++--
 .../chemistry/partialchargehistogram.py       | 12 ++---
 .../chemistry/partialchargestats.py           | 15 +++---
 src/mofdscribe/featurizers/chemistry/price.py | 13 +++--
 src/mofdscribe/featurizers/chemistry/racs.py  | 13 +++--
 .../featurizers/graph/dimensionality.py       |  5 ++
 .../featurizers/graph/graphfeaturizer.py      | 53 ++-----------------
 .../featurizers/pore/geometric_properties.py  | 12 ++---
 src/mofdscribe/featurizers/pore/voxelgrid.py  |  4 +-
 src/mofdscribe/mof.py                         |  4 +-
 src/mofdscribe/types.py                       |  2 +-
 tests/featurizers/chemistry/test_amd.py       |  9 ++--
 tests/featurizers/chemistry/test_aprdf.py     |  3 +-
 .../featurizers/chemistry/test_energygrid.py  |  7 +--
 tests/featurizers/chemistry/test_henry.py     |  9 ++--
 .../chemistry/test_partialchargehistogram.py  |  3 +-
 .../chemistry/test_partialchargestats.py      |  3 +-
 tests/featurizers/chemistry/test_price.py     |  5 +-
 tests/featurizers/chemistry/test_racs.py      | 11 ++--
 .../featurizers/graph/test_dimensionality.py  |  6 ++-
 .../featurizers/graph/test_graphfeaturizer.py |  9 ----
 .../hostguest/test_host_guest_featurizer.py   |  1 +
 25 files changed, 114 insertions(+), 154 deletions(-)
 delete mode 100644 tests/featurizers/graph/test_graphfeaturizer.py

diff --git a/src/mofdscribe/featurizers/chemistry/amd.py b/src/mofdscribe/featurizers/chemistry/amd.py
index 6daffeda..54a488be 100644
--- a/src/mofdscribe/featurizers/chemistry/amd.py
+++ b/src/mofdscribe/featurizers/chemistry/amd.py
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
 """Generalized average minimum distance (AMD) featurizer."""
-from typing import List, Tuple, Union
+from typing import List, Tuple
 
 import numpy as np
-from pymatgen.core import IStructure, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.substructures import filter_element
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ["AMD"]
 
@@ -43,7 +43,6 @@ def __init__(
         ),
         compute_for_all_elements: bool = True,
         aggregations: Tuple[str] = ("mean",),
-        primitive: bool = True,
     ) -> None:
         """Initialize the AMD descriptor.
 
@@ -60,8 +59,6 @@ def __init__(
                 the original structure with all elements. Defaults to True.
             aggregations (tuple): Aggregations of the AMD descriptor.
                 The 'mean' is equivalent to the original AMD. Defaults to ('mean',).
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.k = k
         atom_types = [] if atom_types is None else atom_types
@@ -71,7 +68,6 @@ def __init__(
         )
         self.compute_for_all_elements = compute_for_all_elements
         self.aggregations = aggregations
-        super().__init__(primitive=primitive)
 
     def _get_feature_labels(self) -> List[str]:
         labels = []
@@ -85,21 +81,30 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
-    def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
         """Compute the AMD descriptor for a given structure.
 
         Args:
-            structure (Union[Structure, IStructure]): Structure to compute the descriptor for.
+            structure (StructureIStructureType): Structure to compute the descriptor for.
 
         Returns:
             A numpy array containing the AMD descriptor.
         """
-        from amd._nns import nearest_neighbours
-        from amd.calculate import _extract_motif_cell
-        from amd.periodicset import PeriodicSet
+        try:
+            from amd._nns import nearest_neighbours
+            from amd.calculate import _extract_motif_cell
+            from amd.periodicset import PeriodicSet
+        except ImportError:
+            raise ImportError(
+                "AMD featurizer requires the AMD package.\
+                See https://github.com/dwiddo/average-minimum-distance for installation instructions."
+            )
 
         def get_pdd(structure, k):
-            motif, cell, asymmetric_unit, multiplicities = _extract_motif_cell(
+            motif, cell, asymmetric_unit, _multiplicities = _extract_motif_cell(
                 PeriodicSet(structure.cart_coords, structure.lattice.matrix)
             )
             pdd, _, _ = nearest_neighbours(motif, cell, asymmetric_unit, k)
diff --git a/src/mofdscribe/featurizers/chemistry/aprdf.py b/src/mofdscribe/featurizers/chemistry/aprdf.py
index 7cbe7efc..95ee0967 100644
--- a/src/mofdscribe/featurizers/chemistry/aprdf.py
+++ b/src/mofdscribe/featurizers/chemistry/aprdf.py
@@ -9,12 +9,11 @@
 
 import numpy as np
 from element_coder import encode
-from pymatgen.core import IStructure, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
-
-from ..utils.aggregators import AGGREGATORS
-from ..utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.featurizers.utils.aggregators import AGGREGATORS
+from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ["APRDF"]
 
@@ -50,7 +49,6 @@ def __init__(
         properties: Tuple[str, int] = ("X", "electron_affinity"),
         aggregations: Tuple[str] = ("avg", "product", "diff"),
         normalize: bool = False,
-        primitive: bool = True,
     ):
         """Set up an atomic property (AP) weighted radial distribution function.
 
@@ -73,8 +71,6 @@ def __init__(
                 options. Defaults to ("avg", "product", "diff").
             normalize (bool): If True, the histogram is normalized by dividing
                 by the number of atoms. Defaults to False.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.lower_lim = lower_lim
         self.cutoff = cutoff
@@ -84,7 +80,6 @@ def __init__(
 
         self.b_smear = b_smear
         self.aggregations = aggregations
-        super().__init__(primitive=primitive)
 
     def precheck(self):
         pass
@@ -106,7 +101,10 @@ def _get_feature_labels(self):
 
         return list(aprdfs.flatten())
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.array:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StructureIStructureType) -> np.array:
         bins = self._bins
         aprdfs = np.zeros((len(self.properties), len(self.aggregations), len(bins)))
 
diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py
index b9fd4a81..fbb0e3f6 100644
--- a/src/mofdscribe/featurizers/chemistry/energygrid.py
+++ b/src/mofdscribe/featurizers/chemistry/energygrid.py
@@ -9,11 +9,11 @@
 from pymatgen.core import IStructure, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.histogram import get_rdf
 from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell
 from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa
-
-from ..utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ["EnergyGridHistogram"]
 GRID_INPUT_TEMPLATE = """SimulationType  MakeASCIGrid
@@ -104,7 +104,6 @@ def __init__(
         shifted: bool = False,
         separate_interactions: bool = True,
         run_eqeq: bool = True,
-        primitive: bool = False,
     ):
         """Construct the EnergyGridHistogram class.
 
@@ -147,8 +146,6 @@ def __init__(
                 Defaults to True.
             run_eqeq (bool): If true, runs EqEq to compute charges.
                 Defaults to True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
 
         Raises:
             ValueError: If the `raspa_dir` is not a valid directory.
@@ -175,7 +172,6 @@ def __init__(
         self.shifted = shifted
         self.separate_interactions = separate_interactions
         self.run_eqeq = run_eqeq
-        super().__init__(primitive=primitive)
 
     def fit_transform(self, structures: List[Union[Structure, IStructure]]):
         ...
@@ -194,7 +190,10 @@ def feature_labels(self) -> List[str]:
                 labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}")
         return labels
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.array:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StructureIStructureType) -> np.array:
         ff_molecules = {self.mol_name: self.mol_ff}
 
         parameters = {
diff --git a/src/mofdscribe/featurizers/chemistry/henry.py b/src/mofdscribe/featurizers/chemistry/henry.py
index 638fa6f0..fa8a13f0 100644
--- a/src/mofdscribe/featurizers/chemistry/henry.py
+++ b/src/mofdscribe/featurizers/chemistry/henry.py
@@ -5,13 +5,13 @@
 from typing import List, Union
 
 import numpy as np
-from pymatgen.core import IStructure, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.raspa.base_parser import parse_base_output
 from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell
 from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ["Henry"]
 
@@ -83,7 +83,6 @@ def __init__(
         separate_interactions: bool = True,
         run_eqeq: bool = True,
         return_std: bool = False,
-        primitive: bool = False,
     ):
         """Initialize the featurizer.
 
@@ -118,8 +117,6 @@ def __init__(
                 Defaults to True.
             return_std (bool): If true, return the standard deviations.
                 Defaults to False.
-            primitive (bool): If true, use the primitive unit cell.
-                Defaults to False.
 
         Raises:
             ValueError: If the `RASPA_DIR` environment variable is not set.
@@ -144,9 +141,11 @@ def __init__(
         self.temperature = temperature
         self.run_eqeq = run_eqeq
         self.return_std = return_std
-        super().__init__(primitive=primitive)
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.array:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: StructureIStructureType) -> np.array:
         ff_molecules = {self.mol_name: self.mol_ff}
 
         parameters = {
diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
index a46000d5..3ac436d9 100644
--- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
+++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """Partial charge histogram featurizer."""
-from typing import List, Union
+from typing import List
 
 import numpy as np
 from pymatgen.core import IStructure, Structure
@@ -9,6 +9,7 @@
 from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.histogram import get_rdf
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ["PartialChargeHistogram"]
 
@@ -27,7 +28,6 @@ def __init__(
         min_charge: float = -4,
         max_charge: float = 4,
         bin_size: float = 0.5,
-        primitive: bool = False,
     ) -> None:
         """Construct a new PartialChargeHistogram featurizer.
 
@@ -38,13 +38,10 @@ def __init__(
                 Defaults to 4.
             bin_size (float): Bin size.
                 Defaults to 0.5.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.min_charge = min_charge
         self.max_charge = max_charge
         self.bin_size = bin_size
-        super().__init__(primitive=primitive)
 
     def _get_grid(self):
         return np.arange(self.min_charge, self.max_charge, self.bin_size)
@@ -52,7 +49,10 @@ def _get_grid(self):
     def feature_labels(self) -> List[str]:
         return [f"chargehist_{val}" for val in self._get_grid()]
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(s=mof.structure)
+
+    def _featurize(self, s: StructureIStructureType) -> np.ndarray:
         if isinstance(s, Structure):
             s = IStructure.from_sites(s.sites)
         _, results = get_eqeq_charges(s)
diff --git a/src/mofdscribe/featurizers/chemistry/partialchargestats.py b/src/mofdscribe/featurizers/chemistry/partialchargestats.py
index d9e0d581..4be296e0 100644
--- a/src/mofdscribe/featurizers/chemistry/partialchargestats.py
+++ b/src/mofdscribe/featurizers/chemistry/partialchargestats.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """Partial charge statistics featurizer."""
-from typing import List, Tuple, Union
+from typing import List, Tuple
 
 import numpy as np
 from pymatgen.core import IStructure, Structure
@@ -9,6 +9,7 @@
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ["PartialChargeStats"]
 
@@ -26,9 +27,7 @@ class PartialChargeStats(MOFBaseFeaturizer):
     <https://www.nature.com/articles/s41467-020-17755-8>`_
     """
 
-    def __init__(
-        self, aggregations: Tuple[str] = ("max", "min", "std"), primitive: bool = True
-    ) -> None:
+    def __init__(self, aggregations: Tuple[str] = ("max", "min", "std")) -> None:
         """
         Initialize the PartialChargeStats featurizer.
 
@@ -36,16 +35,16 @@ def __init__(
             aggregations (Tuple[str]): Aggregations to compute.
                 For available methods, see :py:obj:`mofdscribe.featurizers.utils.aggregators.ARRAY_AGGREGATORS`.
                 Defaults to ("max", "min", "std").
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.aggregations = aggregations
-        super().__init__(primitive=primitive)
 
     def feature_labels(self) -> List[str]:
         return [f"chargestat_{agg}" for agg in self.aggregations]
 
-    def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(s=mof.structure)
+
+    def _featurize(self, s: StructureIStructureType) -> np.ndarray:
         if isinstance(s, Structure):
             s = IStructure.from_sites(s.sites)
         _, results = get_eqeq_charges(s)
diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py
index 0135217e..afde093d 100644
--- a/src/mofdscribe/featurizers/chemistry/price.py
+++ b/src/mofdscribe/featurizers/chemistry/price.py
@@ -9,6 +9,7 @@
 import pandas as pd
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.types import StructureIStructureType
 
 _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 
@@ -101,23 +102,18 @@ class PriceLowerBound(MOFBaseFeaturizer):
         "per_atom": _price_per_atom,
     }
 
-    def __init__(
-        self, projections: Tuple[str] = ("gravimetric", "volumetric"), primitive: bool = False
-    ):
+    def __init__(self, projections: Tuple[str] = ("gravimetric", "volumetric")):
         """Initialize the PriceLowerBound featurizer.
 
         Args:
             projections (Tuple[str]): List of projections to use.
                 Possible values are "gravimetric" and "volumetric".
                 Default is ("gravimetric", "volumetric").
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to False.
         """
         self.projections = projections
         self.element_masses = _get_element_abundance()
 
         self.projections = projections
-        super().__init__(primitive=primitive)
 
     def feature_labels(self) -> List[str]:
         labels = []
@@ -125,7 +121,10 @@ def feature_labels(self) -> List[str]:
             labels.append(f"price_lower_bound_{projection}")
         return labels
 
-    def _featurize(self, structure) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
         element_masses = {}
 
         for site in structure.sites:
diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py
index fe6981f3..4087d034 100644
--- a/src/mofdscribe/featurizers/chemistry/racs.py
+++ b/src/mofdscribe/featurizers/chemistry/racs.py
@@ -10,6 +10,7 @@
 from pymatgen.core import IStructure, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.chemistry._fragment import get_bb_indices
 from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.structure_graph import (
@@ -17,8 +18,7 @@
     get_neighbors_at_distance,
     get_structure_graph,
 )
-
-from ._fragment import get_bb_indices
+from mofdscribe.types import StructureIStructureType
 
 __all__ = ("RACS",)
 
@@ -166,7 +166,6 @@ def __init__(
             "linker_scaffold",
             "nodes",
         ),
-        primitive: bool = True,
     ) -> None:
         """
         Initialize the RACS featurizer.
@@ -184,8 +183,6 @@ def __init__(
             bond_heuristic (str): Method used to guess bonds. Defaults to "vesta".
             bbs (Tuple[str]): Building blocks to use. Defaults to ("linker_all",
                 "linker_connecting", "linker_functional", "linker_scaffold", "nodes").
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         self.attributes = attributes
         self.scopes = scopes
@@ -203,9 +200,11 @@ def __init__(
                 "linker_scaffold",
                 "nodes",
             ]
-        super().__init__(primitive=primitive)
 
-    def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure)
+
+    def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
         if isinstance(structure, Structure):
             structure = IStructure.from_sites(structure)
         sg = get_structure_graph(structure, self.bond_heuristic)
diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py
index 8f7c8633..7c0d9c77 100644
--- a/src/mofdscribe/featurizers/graph/dimensionality.py
+++ b/src/mofdscribe/featurizers/graph/dimensionality.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+"""Compute the bond-topological dimensionality."""
 from typing import List
 
 import numpy as np
@@ -11,6 +13,9 @@ class Dimensionality(GraphFeaturizer):
     def __init__(self) -> None:
         super().__init__()
 
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        return self._featurize(mof.structure_graph)
+
     def _featurize(self, structure_graph: StructureGraph) -> np.ndarray:
         return get_dimensionality_larsen(structure_graph)
 
diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py
index c083d69d..4f5dddc0 100644
--- a/src/mofdscribe/featurizers/graph/graphfeaturizer.py
+++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py
@@ -1,60 +1,15 @@
-from functools import lru_cache
-from typing import Union
-
-import numpy as np
-from matminer.featurizers.base import BaseFeaturizer
-from pymatgen.analysis.graphs import StructureGraph
-from pymatgen.core.structure import IStructure, Structure
-
+# -*- coding: utf-8 -*-
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
-from mofdscribe.featurizers.utils.extend import (
-    operates_on_istructure,
-    operates_on_structure,
-    operates_on_structuregraph,
-)
-from mofdscribe.featurizers.utils.structure_graph import (
-    get_structure_graph as get_structure_graph_cached,
-)
+from mofdscribe.featurizers.utils.extend import operates_on_structuregraph
 
 # operates on needs to be extended to graphs
 
 
-def get_structure_graph(
-    structure: Union[Structure, IStructure], method: str = "vesta"
-) -> StructureGraph:
-    if isinstance(structure, Structure):
-        return get_structure_graph_cached(IStructure.from_sites(structure.sites), method)
-    elif isinstance(structure, IStructure):
-        return get_structure_graph_cached(structure, method)
-    else:
-        raise TypeError(f"Expected Structure or IStructure, got {type(structure)}")
-
-
 # cannot use here the MOFBaseFeaturizer because the input is not always a structure
 # not super sure about this yet, but I think, over the long run, I'd like this to only accept graphs
 # i'd like to have some kind of transformer objects that handle the conversion from structure to graph
 # if needed
 # all of this could then be orchestrated in a pipeline object
 @operates_on_structuregraph
-@operates_on_structure
-@operates_on_istructure
-class GraphFeaturizer(BaseFeaturizer):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _featurize(self, structure_graph: StructureGraph) -> np.ndarray:
-        raise NotImplementedError()
-
-    def featurize(
-        self, structure_or_structuregraph: Union[Structure, StructureGraph]
-    ) -> np.ndarray:
-        if isinstance(structure_or_structuregraph, (Structure, IStructure)):
-            structure_graph = get_structure_graph(structure_or_structuregraph)
-        elif isinstance(structure_or_structuregraph, StructureGraph):
-            structure_graph = structure_or_structuregraph
-        else:
-            raise TypeError(
-                f"Expected Structure or StructureGraph, got {type(structure_or_structuregraph)}"
-            )
-
-        return self._featurize(structure_graph)
+class GraphFeaturizer(MOFBaseFeaturizer):
+    pass
diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py
index 9bed420b..09ca6beb 100644
--- a/src/mofdscribe/featurizers/pore/geometric_properties.py
+++ b/src/mofdscribe/featurizers/pore/geometric_properties.py
@@ -21,7 +21,7 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils import is_tool
 from mofdscribe.featurizers.utils.tempdir import TEMPDIR
-from mofdscribe.types import StuctureIStructureType
+from mofdscribe.types import StructureIStructureType
 
 ZEOPP_BASE_COMMAND = ["network"]
 HA_COMMAND = ["-ha"]
@@ -197,7 +197,7 @@ def __init__(
     def featurize(self, mof: "MOF") -> np.ndarray:
         return self._featurize(mof.structure)
 
-    def _featurize(self, s: StuctureIStructureType):
+    def _featurize(self, s: StructureIStructureType):
         result = run_zeopp(s, ["-res"], _parse_res_zeopp, self.ha)
         return np.array(list(result.values()))
 
@@ -284,7 +284,7 @@ def __init__(
     def featurize(self, mof: "MOF") -> np.ndarray:
         return self._featurize(mof.structure)
 
-    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
+    def _featurize(self, s: StructureIStructureType) -> np.ndarray:
         command = [
             "-sa",
             f"{self.channel_radius}",
@@ -375,7 +375,7 @@ def __init__(
     def featurize(self, mof: "MOF") -> np.ndarray:
         return self._featurize(mof.structure)
 
-    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
+    def _featurize(self, s: StructureIStructureType) -> np.ndarray:
         command = ["-vol", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}"]
         results = run_zeopp(s, command, _parse_volpo_zeopp, self.ha)
         return np.array(list(results.values()))
@@ -489,7 +489,7 @@ def feature_labels(self) -> List[str]:
     def featurize(self, mof: "MOF") -> np.ndarray:
         return self._featurize(mof.structure)
 
-    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
+    def _featurize(self, s: StructureIStructureType) -> np.ndarray:
         command = [
             "-ray_atom",
             f"{self.channel_radius}",
@@ -627,7 +627,7 @@ def feature_labels(self) -> List[str]:
     def featurize(self, mof: "MOF") -> np.ndarray:
         return self._featurize(mof.structure)
 
-    def _featurize(self, s: StuctureIStructureType) -> np.ndarray:
+    def _featurize(self, s: StructureIStructureType) -> np.ndarray:
         command = [
             "-psd",
             f"{self.channel_radius}",
diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py
index 7605327a..68795c01 100644
--- a/src/mofdscribe/featurizers/pore/voxelgrid.py
+++ b/src/mofdscribe/featurizers/pore/voxelgrid.py
@@ -8,7 +8,7 @@
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase
-from mofdscribe.types import StuctureIStructureType
+from mofdscribe.types import StructureIStructureType
 
 
 def make_supercell(
@@ -154,7 +154,7 @@ def __init__(
     def featurize(self, mof: "MOF") -> np.ndarray:
         return self._featurize(mof.structure)
 
-    def _featurize(self, structure: StuctureIStructureType) -> np.ndarray:
+    def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
         coords, numbers = make_supercell(
             structure.cart_coords, structure.atomic_numbers, structure.lattice.matrix, self.min_size
         )
diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py
index 4790fbf6..c79b31fa 100644
--- a/src/mofdscribe/mof.py
+++ b/src/mofdscribe/mof.py
@@ -14,7 +14,7 @@
     undecorated_scaffold_hash,
 )
 
-from mofdscribe.types import PathType, StuctureIStructureType
+from mofdscribe.types import PathType, StructureIStructureType
 
 
 class MOF:
@@ -48,7 +48,7 @@ class MOF:
 
     def __init__(
         self,
-        structure: StuctureIStructureType,
+        structure: StructureIStructureType,
         structure_graph: Optional[StructureGraph] = None,
         local_env_strategy: str = "vesta",
         fragmentation_kwargs: Optional[dict] = None,
diff --git a/src/mofdscribe/types.py b/src/mofdscribe/types.py
index 30d98e98..ee6093c4 100644
--- a/src/mofdscribe/types.py
+++ b/src/mofdscribe/types.py
@@ -6,4 +6,4 @@
 from typing_extensions import TypeAlias
 
 PathType: TypeAlias = Union[str, Path]
-StuctureIStructureType: TypeAlias = Union[IStructure, Structure]
+StructureIStructureType: TypeAlias = Union[IStructure, Structure]
diff --git a/tests/featurizers/chemistry/test_amd.py b/tests/featurizers/chemistry/test_amd.py
index c5f50a88..b7eb158c 100644
--- a/tests/featurizers/chemistry/test_amd.py
+++ b/tests/featurizers/chemistry/test_amd.py
@@ -6,21 +6,22 @@
 import pytest
 
 from mofdscribe.featurizers.chemistry.amd import AMD
+from mofdscribe.mof import MOF
 
-THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
 def test_amd_consistency(hkust_structure):
     """Test that AMD is consistent with the original package."""
     amd_mofdscribe = AMD(k=100, atom_types=None)
-    amd_hkust_mofdscribe = amd_mofdscribe.featurize(hkust_structure)
+    amd_hkust_mofdscribe = amd_mofdscribe.featurize(MOF(hkust_structure))
 
-    reader = amd.CifReader(os.path.join(THIS_DIR, "..", "..", "test_files", "HKUST-1.cif"))
+    reader = amd.CifReader(os.path.join(_THIS_DIR, "..", "..", "test_files", "HKUST-1.cif"))
     amds = [amd.AMD(crystal, 100) for crystal in reader]
     for feat, amd_feat in zip(amds[0], amd_hkust_mofdscribe):
         assert feat == pytest.approx(amd_feat, abs=1e-3)
 
     amd_mofdscribe = AMD(k=100, aggregations=("mean", "std"))
-    amd_hkust_mofdscribe = amd_mofdscribe.featurize(hkust_structure)
+    amd_hkust_mofdscribe = amd_mofdscribe.featurize(MOF(hkust_structure))
     assert len(amd_hkust_mofdscribe) == 100 * 2 * 4
     assert len(amd_mofdscribe.feature_labels()) == 100 * 2 * 4
diff --git a/tests/featurizers/chemistry/test_aprdf.py b/tests/featurizers/chemistry/test_aprdf.py
index 576350ef..78baf67c 100644
--- a/tests/featurizers/chemistry/test_aprdf.py
+++ b/tests/featurizers/chemistry/test_aprdf.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Test APRDF featurizer."""
 from mofdscribe.featurizers.chemistry.aprdf import APRDF
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -8,7 +9,7 @@
 def test_aprdf(hkust_structure):
     """Make sure that the featurization works for typical MOFs and the number of features is as expected."""
     aprdf_featurizer = APRDF()
-    feats = aprdf_featurizer.featurize(hkust_structure)
+    feats = aprdf_featurizer.featurize(MOF(hkust_structure))
     assert len(feats) == 432
     assert len(aprdf_featurizer.feature_labels()) == 432
     assert is_jsonable(dict(zip(aprdf_featurizer.feature_labels(), feats)))
diff --git a/tests/featurizers/chemistry/test_energygrid.py b/tests/featurizers/chemistry/test_energygrid.py
index 8efc8ae3..99e034e7 100644
--- a/tests/featurizers/chemistry/test_energygrid.py
+++ b/tests/featurizers/chemistry/test_energygrid.py
@@ -3,15 +3,16 @@
 import os
 
 from mofdscribe.featurizers.chemistry.energygrid import EnergyGridHistogram, read_ascii_grid
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
-THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
 def test_read_ascii_grid():
     """Ensure that we can parse an ASCII grid file."""
-    file_name = os.path.join(THIS_DIR, "..", "..", "test_files", "asci_grid_C_co2.grid")
+    file_name = os.path.join(_THIS_DIR, "..", "..", "test_files", "asci_grid_C_co2.grid")
     result = read_ascii_grid(file_name)
     assert len(result) == 22185
     assert result["energy"].dtype == float
@@ -20,7 +21,7 @@ def test_read_ascii_grid():
 def test_energygrid(hkust_structure):
     """Make sure that the featurization works for typical MOFs and the number of features is as expected."""
     eg = EnergyGridHistogram()
-    feats = eg.featurize(hkust_structure)
+    feats = eg.featurize(MOF(hkust_structure))
 
     assert len(feats) == 40
     assert len(eg.feature_labels()) == 40
diff --git a/tests/featurizers/chemistry/test_henry.py b/tests/featurizers/chemistry/test_henry.py
index 683164b3..24ff6df0 100644
--- a/tests/featurizers/chemistry/test_henry.py
+++ b/tests/featurizers/chemistry/test_henry.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Test the Henry featurizer."""
 from mofdscribe.featurizers.chemistry.henry import Henry
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -8,19 +9,19 @@
 def test_henryhoa(hkust_structure, irmof_structure):
     """Make sure that the featurization works for typical MOFs and the number of features is as expected."""
     featurizer = Henry(cycles=10)
-    features = featurizer.featurize(hkust_structure)
+    features = featurizer.featurize(MOF(hkust_structure))
     labels = featurizer.feature_labels()
     assert len(features) == len(labels)
-    features_irmof = featurizer.featurize(irmof_structure)
+    features_irmof = featurizer.featurize(MOF(irmof_structure))
     assert sum(abs(features - features_irmof)) > 0
     assert is_jsonable(dict(zip(featurizer.feature_labels(), features)))
     assert features_irmof.ndim == 1
 
     featurizer = Henry(cycles=500, return_std=True)
-    features = featurizer.featurize(hkust_structure)
+    features = featurizer.featurize(MOF(hkust_structure))
     assert len(features) == len(featurizer.feature_labels()) == 4
 
     # make sure we indeed use pyeqeq
     featurizer = Henry(cycles=500, return_std=True, run_eqeq=False)
-    features_no_charge = featurizer.featurize(hkust_structure)
+    features_no_charge = featurizer.featurize(MOF(hkust_structure))
     assert features_no_charge[0] < features[0]
diff --git a/tests/featurizers/chemistry/test_partialchargehistogram.py b/tests/featurizers/chemistry/test_partialchargehistogram.py
index 2260f96a..c53d28d4 100644
--- a/tests/featurizers/chemistry/test_partialchargehistogram.py
+++ b/tests/featurizers/chemistry/test_partialchargehistogram.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Test partial charge featurizer."""
 from mofdscribe.featurizers.chemistry.partialchargehistogram import PartialChargeHistogram
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -13,7 +14,7 @@ def test_partialchargehistogram(hkust_structure, irmof_structure):
     assert len(pch.feature_labels()) == 16
 
     for structure in [hkust_structure, irmof_structure]:
-        features = pch.featurize(structure)
+        features = pch.featurize(MOF(structure))
         assert len(features) == 16
     assert is_jsonable(dict(zip(pch.feature_labels(), features)))
     assert features.ndim == 1
diff --git a/tests/featurizers/chemistry/test_partialchargestats.py b/tests/featurizers/chemistry/test_partialchargestats.py
index 8510c2ad..b663eee6 100644
--- a/tests/featurizers/chemistry/test_partialchargestats.py
+++ b/tests/featurizers/chemistry/test_partialchargestats.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Test the PartialChargeStats featurizer."""
 from mofdscribe.featurizers.chemistry.partialchargestats import PartialChargeStats
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -11,7 +12,7 @@ def test_partial_charge_stats(hkust_structure, irmof_structure):
     """
     for structure in [hkust_structure, irmof_structure]:
         featurizer = PartialChargeStats()
-        feats = featurizer.featurize(structure)
+        feats = featurizer.featurize(MOF(structure))
         assert len(feats) == 3
     assert len(featurizer.feature_labels()) == 3
     assert len(featurizer.citations()) == 2
diff --git a/tests/featurizers/chemistry/test_price.py b/tests/featurizers/chemistry/test_price.py
index f228329d..cc6863a9 100644
--- a/tests/featurizers/chemistry/test_price.py
+++ b/tests/featurizers/chemistry/test_price.py
@@ -2,17 +2,18 @@
 import pytest
 
 from mofdscribe.featurizers.chemistry.price import PriceLowerBound
+from mofdscribe.mof import MOF
 
 
 def test_price_lower_bound(hkust_structure):
     """Comparing with the original implementation."""
     pricer = PriceLowerBound()
-    feats = pricer.featurize(hkust_structure)
+    feats = pricer.featurize(MOF(hkust_structure))
     assert len(feats) == 2
     assert feats[0] == pytest.approx(4.176635436396251)
     assert feats[1] == pytest.approx(3.671662426852288)
 
     pricer = PriceLowerBound(("per_atom",))
-    feats = pricer.featurize(hkust_structure)
+    feats = pricer.featurize(MOF(hkust_structure))
     assert len(feats) == 1
     assert feats[0] == pytest.approx(64.77758513112447)
diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py
index 7ce244c0..44457fa7 100644
--- a/tests/featurizers/chemistry/test_racs.py
+++ b/tests/featurizers/chemistry/test_racs.py
@@ -8,6 +8,7 @@
 from mofdscribe.featurizers.chemistry._fragment import get_bb_indices
 from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs
 from mofdscribe.featurizers.utils.structure_graph import get_structure_graph
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -16,7 +17,7 @@ def test_racs(hkust_structure, irmof_structure):
     """Make sure that the featurization works for typical MOFs and the number of features is as expected."""
     for structure in [hkust_structure, irmof_structure]:
         featurizer = RACS()
-        feats = featurizer.featurize(structure)
+        feats = featurizer.featurize(MOF(structure))
         sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic)
         racs = {}
         bb_indices = get_bb_indices(sg)
@@ -50,8 +51,8 @@ def test_racs(hkust_structure, irmof_structure):
 def test_racs_functional(irmof_structure, abacuf_structure, floating_structure):
     # ABACUF doesn't have linkers with a core. It is simply HCOO
     for structure in [abacuf_structure]:
-        featurizer = RACS(primitive=False)  # because also the linker structure is not primitive
-        feats = featurizer.featurize(structure)
+        featurizer = RACS()  
+        feats = featurizer.featurize(MOF(structure))
         # assert len(feats) == 4 * 3 * 8 * 5  # 4 properties, 3 scopes, 8 aggregations, 5 bb types
         sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic)
         racs = {}
@@ -78,7 +79,7 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure):
     assert is_jsonable(dict(zip(featurizer.feature_labels(), feats)))
     assert feats.ndim == 1
 
-    floating_feats = featurizer.featurize(floating_structure)
-    irmof_feats = featurizer.featurize(irmof_structure)
+    floating_feats = featurizer.featurize(MOF(floating_structure))
+    irmof_feats = featurizer.featurize(MOF(irmof_structure))
 
     assert np.allclose(floating_feats, irmof_feats, rtol=0.05, equal_nan=True)
diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py
index 25b96e05..52e3e85e 100644
--- a/tests/featurizers/graph/test_dimensionality.py
+++ b/tests/featurizers/graph/test_dimensionality.py
@@ -1,9 +1,11 @@
+# -*- coding: utf-8 -*-
 from mofdscribe.featurizers.graph.dimensionality import Dimensionality
 from mofdscribe.featurizers.utils.structure_graph import get_structure_graph
+from mofdscribe.mof import MOF
 
 
 def test_dimensionality(hkust_structure):
     dim = Dimensionality()
     sg = get_structure_graph(hkust_structure, "vesta")
-    assert dim.featurize(hkust_structure) == 3
-    assert dim.featurize(sg) == 3
+    assert dim.featurize(MOF(hkust_structure)) == 3
+    assert dim._featurize(sg) == 3
diff --git a/tests/featurizers/graph/test_graphfeaturizer.py b/tests/featurizers/graph/test_graphfeaturizer.py
deleted file mode 100644
index d130450a..00000000
--- a/tests/featurizers/graph/test_graphfeaturizer.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from pymatgen.analysis.graphs import StructureGraph
-
-from mofdscribe.featurizers.graph.graphfeaturizer import get_structure_graph
-
-
-def test_get_structure_graph(hkust_structure):
-    sg = get_structure_graph(hkust_structure)
-    assert isinstance(sg, StructureGraph)
-    assert len(sg) == len(hkust_structure)
diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py
index 9b194c3e..dbbd1bbf 100644
--- a/tests/featurizers/hostguest/test_host_guest_featurizer.py
+++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from matminer.featurizers.structure.sites import SiteStatsFingerprint
 
 from mofdscribe.featurizers.hostguest import HostGuestFeaturizer

From 73daf4d3455e0af586af564aebeeace06cba9320 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Wed, 21 Dec 2022 19:59:35 +0100
Subject: [PATCH 03/17] continue refactor

---
 src/mofdscribe/featurizers/base.py            |  1 +
 .../featurizers/bu/bu_featurizer.py           | 22 +++++--------
 src/mofdscribe/featurizers/bu/bu_matches.py   |  2 +-
 src/mofdscribe/featurizers/bu/fragment.py     | 22 -------------
 .../featurizers/bu/shape_featurizer.py        |  2 +-
 .../featurizers/bu/smarts_matches.py          |  1 +
 src/mofdscribe/featurizers/matmineradapter.py | 33 +++++++++++++++++++
 tests/featurizers/chemistry/test_racs.py      |  2 +-
 8 files changed, 46 insertions(+), 39 deletions(-)
 delete mode 100644 src/mofdscribe/featurizers/bu/fragment.py
 create mode 100644 src/mofdscribe/featurizers/matmineradapter.py

diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py
index 75484d56..e5105a91 100644
--- a/src/mofdscribe/featurizers/base.py
+++ b/src/mofdscribe/featurizers/base.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """Base featurizer for MOF structure based featurizers.
 
 The main purpose of these classes is currently that they handle
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index d1a3c50b..baff7e82 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -3,11 +3,10 @@
 from typing import Collection, List, Optional, Tuple, Union
 
 import numpy as np
-from matminer.featurizers.base import BaseFeaturizer
-from pydantic import BaseModel
 from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.bu.fragment import fragment
 from mofdscribe.featurizers.bu.utils import boxed_molecule
 from mofdscribe.featurizers.utils import nan_array, set_operates_on
@@ -16,25 +15,18 @@
 STRUCTURE_VIEWS = ("all_atom", "connecting", "binding")
 
 
-class MOFBBs(BaseModel):
-    """Container for MOF building blocks."""
-
-    nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]]
-    linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]]
-
-
-class ConnectingSitesFeaturizer(BaseFeaturizer):
+class ConnectingSitesFeaturizer(MOFBaseFeaturizer):
     ...
 
 
-class BindingSitesFeaturizer(BaseFeaturizer):
+class BindingSitesFeaturizer(MOFBaseFeaturizer):
     ...
 
 
 # If we would cache the fragmentation, the implementation would be simpler
 # ToDo: support moffragmentor options
 # ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method).
-class BUFeaturizer(BaseFeaturizer):
+class BUFeaturizer(MOFBaseFeaturizer):
     """
     Compute features on the BUs and then aggregate them.
 
@@ -65,13 +57,15 @@ class BUFeaturizer(BaseFeaturizer):
     """
 
     def __init__(
-        self, featurizer: BaseFeaturizer, aggregations: Tuple[str] = ("mean", "std", "min", "max")
+        self,
+        featurizer: MOFBaseFeaturizer,
+        aggregations: Tuple[str] = ("mean", "std", "min", "max"),
     ) -> None:
         """
         Construct a new BUFeaturizer.
 
         Args:
-            featurizer (BaseFeaturizer): The featurizer to use.
+            featurizer (MOFBaseFeaturizer): The featurizer to use.
                 Currently, we do not support `MultipleFeaturizer`s.
                 Please, instead, use multiple BUFeaturizers.
                 If you use a featurizer that is not implemented in mofdscribe
diff --git a/src/mofdscribe/featurizers/bu/bu_matches.py b/src/mofdscribe/featurizers/bu/bu_matches.py
index 4bd3c99c..9140bdc1 100644
--- a/src/mofdscribe/featurizers/bu/bu_matches.py
+++ b/src/mofdscribe/featurizers/bu/bu_matches.py
@@ -10,7 +10,7 @@
 from matminer.featurizers.base import BaseFeaturizer
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
-from ..utils.aggregators import ARRAY_AGGREGATORS
+from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 with open(os.path.join(THIS_DIR, "prototype_env.json"), "r") as handle:
diff --git a/src/mofdscribe/featurizers/bu/fragment.py b/src/mofdscribe/featurizers/bu/fragment.py
deleted file mode 100644
index 453b2359..00000000
--- a/src/mofdscribe/featurizers/bu/fragment.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from functools import lru_cache
-from typing import List, Union
-
-from pymatgen.core import IStructure, Structure
-
-
-@lru_cache(maxsize=None)
-def _fragment_cached(structure):
-    from moffragmentor import MOF
-
-    mof = MOF.from_structure(structure)
-    fragments = mof.fragment()
-    return fragments
-
-
-def fragment(structure: Union[Structure, IStructure]) -> List[Structure]:
-    if isinstance(structure, Structure):
-        return _fragment_cached(IStructure.from_sites(structure.sites))
-    elif isinstance(structure, IStructure):
-        return _fragment_cached(structure)
-    else:
-        raise TypeError(f"Expected Structure or IStructure, got {type(structure)}")
diff --git a/src/mofdscribe/featurizers/bu/shape_featurizer.py b/src/mofdscribe/featurizers/bu/shape_featurizer.py
index c1b1002f..a71980ed 100644
--- a/src/mofdscribe/featurizers/bu/shape_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/shape_featurizer.py
@@ -13,7 +13,7 @@
 from rdkit.Chem.Descriptors3D import RadiusOfGyration as RadiusOfGyration_RDKIT
 from rdkit.Chem.Descriptors3D import SpherocityIndex as SpherocityIndex_RDKIT
 
-from .rdkitadaptor import RDKitAdaptor
+from mofdscribe.featurizes.bu.rdkitadaptor import RDKitAdaptor
 
 
 def _rod_likeness(mol):
diff --git a/src/mofdscribe/featurizers/bu/smarts_matches.py b/src/mofdscribe/featurizers/bu/smarts_matches.py
index ae5b141d..86465d13 100644
--- a/src/mofdscribe/featurizers/bu/smarts_matches.py
+++ b/src/mofdscribe/featurizers/bu/smarts_matches.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """Featurize a molecule using SMARTS matches."""
 from functools import partial
 from typing import Collection, List, Optional
diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py
new file mode 100644
index 00000000..7f7deb3c
--- /dev/null
+++ b/src/mofdscribe/featurizers/matmineradapter.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""Use matminer featurizers in mofdscribe."""
+from typing import List
+
+from pymatgen.core.structure import Structure
+
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+
+
+class MatminerAdapter(MOFBaseFeaturizer):
+    def __init__(self, matminer_featurizer):
+        self.matminer_featurizer = matminer_featurizer
+
+    def _featurize(self, structure: Structure):
+        return self.matminer_featurizer.featurize(structure)
+
+    def featurize(self, mof):
+        return self._featurize(mof.structure)
+
+    def _fit(self, structures: List[Structure]):
+        self.matminer_featurizer.fit(structures)
+
+    def fit(self, mofs):
+        self._fit([mof.structure for mof in mofs])
+
+    def feature_labels(self):
+        return self.matminer_featurizer.feature_labels()
+
+    def citations(self):
+        return self.matminer_featurizer.citations()
+
+    def implementors(self):
+        return self.matminer_featurizer.implementors()
diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py
index 44457fa7..6262be54 100644
--- a/tests/featurizers/chemistry/test_racs.py
+++ b/tests/featurizers/chemistry/test_racs.py
@@ -51,7 +51,7 @@ def test_racs(hkust_structure, irmof_structure):
 def test_racs_functional(irmof_structure, abacuf_structure, floating_structure):
     # ABACUF doesn't have linkers with a core. It is simply HCOO
     for structure in [abacuf_structure]:
-        featurizer = RACS()  
+        featurizer = RACS()
         feats = featurizer.featurize(MOF(structure))
         # assert len(feats) == 4 * 3 * 8 * 5  # 4 properties, 3 scopes, 8 aggregations, 5 bb types
         sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic)

From 81cf3353ca5edf2f2426389269a997223dce7e5f Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Thu, 22 Dec 2022 13:25:19 +0100
Subject: [PATCH 04/17] finished most changes

---
 docs/source/background.rst                    | 16 ++--
 docs/source/getting_started.rst               | 34 +++----
 examples/build_model_using_mofdscribe.ipynb   |  5 +-
 examples/featurize.ipynb                      |  7 +-
 .../featurizers/bu/bu_featurizer.py           | 58 ++++++------
 src/mofdscribe/featurizers/bu/rdkitadaptor.py |  4 +-
 .../featurizers/bu/shape_featurizer.py        |  2 +-
 src/mofdscribe/featurizers/matmineradapter.py | 62 ++++++++++++-
 tests/featurizers/bu/test_bu_featurizer.py    | 27 +++---
 tests/test_files/BTC.cif                      | 93 ++++++++-----------
 tests/test_matmineradapter.py                 | 35 +++++++
 11 files changed, 211 insertions(+), 132 deletions(-)
 create mode 100644 tests/test_matmineradapter.py

diff --git a/docs/source/background.rst b/docs/source/background.rst
index 6fa7e999..d47926d7 100644
--- a/docs/source/background.rst
+++ b/docs/source/background.rst
@@ -75,11 +75,13 @@ mofdscribe can compute descriptors that are BU-centred, for instance, using RDKi
     from matminer.featurizers.structure import SiteStatsFingerprint
     from pymatgen.core import Structure
     from mofdscribe.featurizers.bu import BUFeaturizer
+    from mofdscribe.mof import MOF
+    from mofdscribe.featurizers.matmineradapter import MatminerAdapter
 
-    base_feat = SiteStatsFingerprint(SOAP.from_preset("formation_energy"))
-    base_feat.fit([hkust_structure])
+    base_feat = MatminerAdapter(SiteStatsFingerprint(SOAP.from_preset("formation_energy")))
+    base_feat.fit([MOF(hkust_structure)])
     featurizer = BUFeaturizer(base_feat, aggregations=("mean",))
-    features = featurizer.featurize(structure=hkust_structure)
+    features = featurizer.featurize(structure=MOF(hkust_structure))
 
 For this, you can either provide your building blocks that you extracted with any of the available tools, or use our integration with our `moffragmentor <https://github.com/kjappelbaum/moffragmentor>`_ package. In this case, we will fragment the MOF into its building blocks and then compute the features for each building block and let you choose how you want to aggregate them.
 
@@ -100,15 +102,15 @@ This featurizer will automatically extract the host and guest structures from th
 .. code-block:: python
 
     from matminer.featurizers.structure.sites import SiteStatsFingerprint
-
+    from mofdscribe.featurizers.matmineradapter import MatminerAdapter
     from mofdscribe.featurizers.hostguest import HostGuestFeaturizer
 
     featurizer = HostGuestFeaturizer(
-        featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"),
+        featurizer=MatminerAdapter(SiteStatsFingerprint.from_preset("SOAP_formation_energy")),
         aggregations=("mean",),
     )
-    featurizer.fit([structure])
-    features = featurizer.featurize(structure)
+    featurizer.fit([mof])
+    features = featurizer.featurize(mof)
     
 If you are interested in surface chemistry features, you might also find suitable featurizers in the `matminer <https://hackingmaterials.lbl.gov/matminer/>`_ package.
 
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 4c9c5d9c..755c594d 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -8,23 +8,12 @@ Featurizing a MOF
 .. code-block:: python
 
     from mofdscribe.chemistry.racs import RACS
-    from pymatgen.core import Structure
+    from mofdscribe.mof import MOF
 
-    s = Structure.from_file(<my_cif>)
+    mof = MOF.from_file(<my_cif>)
     featurizer = RACS()
-    features = featurizer.featurize(s)
+    features = featurizer.featurize(mof)
 
-.. admonition:: mofdscribe base classes
-    :class: hint
-
-    Most featurizers in mofdscribe inherit from :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`.
-    This class can also handle the conversion to primitive cells if you pass :code:`primitive=True` to the
-    constructor. This can be useful to save computational time but also make it possible to, e.g., 
-    use the :code:`sum` aggregation.
-
-    To avoid re-computation of the primitive cell, you should use the :py:class:`~mofdscribe.featurizers.base.MOFMultipleFeaturizer`
-    for combining multiple featurizers. This will accept a keyword argument :code:`primitive=True` in the constructor 
-    and then compute the primitive cell once and use it for all the featurizers.
 
 It is also easy to combine multiple featurizers into a single pipeline:
 
@@ -32,21 +21,21 @@ It is also easy to combine multiple featurizers into a single pipeline:
 
     from mofdscribe.chemistry.racs import RACS
     from mofdscribe.pore.geometric_properties import PoreDiameters
-    from pymatgen.core import Structure
+    from mofdscribe.mof import MOF
     from mofdscribe.featurizers.base import MOFMultipleFeaturizer
 
-    s = Structure.from_file(<my_cif>)
+    mof = MOF.from_file(<my_cif>)
     featurizer = MOFMultipleFeaturizer([RACS(), PoreDiameters()])
-    features = featurizer.featurize(s)
+    features = featurizer.featurize(mof)
 
 You can, of course, also pass multiple structures to the featurizer (and the
 featurization is automatically parallelized via matminer):
 
 .. code-block:: python
 
-  s = Structure.from_file(<my_cif>)
-  s2 = Structure.from_file(<my_cif2>)
-  features = featurizer.featurize_many([s, s2])
+  mof_1 = MOF.from_file(<my_cif>)
+  mof_2 = MOF.from_file(<my_cif2>)
+  features = featurizer.featurize_many([mof_1, mof_2])
 
 
 And, clearly, you can also use the `mofdscribe` featurizers alongside ones from `matminer`:
@@ -55,9 +44,10 @@ And, clearly, you can also use the `mofdscribe` featurizers alongside ones from
 
     from matminer.featurizers.structure import LocalStructuralOrderParams
     from mofdscribe.chemistry.racs import RACS
+    from mofdscribe.featurizers.matmineradapter import MatminerAdapter
 
-    featurizer = MOFMultipleFeaturizer([RACS(), LocalStructuralOrderParams()])
-    features = featurizer.featurize_many([s, s2])
+    featurizer = MOFMultipleFeaturizer([RACS(), MatminerAdapter(LocalStructuralOrderParams())])
+    features = featurizer.featurize_many([mof_2, mof_2])
 
 
 If you use the :code:`zeo++` or :code:`raspa2` packages, you can customize the temporary
diff --git a/examples/build_model_using_mofdscribe.ipynb b/examples/build_model_using_mofdscribe.ipynb
index 686248ef..94093bd8 100644
--- a/examples/build_model_using_mofdscribe.ipynb
+++ b/examples/build_model_using_mofdscribe.ipynb
@@ -54,7 +54,8 @@
     "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n",
     "from mofdscribe.datasets.thermal_stability_dataset import ThermalStabilityDataset\n",
     "from mofdscribe.datasets.structuredataset import FrameDataset\n",
-    "from mofdscribe.splitters import HashSplitter"
+    "from mofdscribe.splitters import HashSplitter\n",
+    "from mofdscribe.mof import MOF"
    ]
   },
   {
@@ -19021,7 +19022,7 @@
    ],
    "source": [
     "feats = featurizer.featurize_many(\n",
-    "    all_structures, ignore_errors=True\n",
+    "    [MOF(s) for s in all_structures], ignore_errors=True\n",
     ")  # we ignore errors here because some structures might not be fragmentable, those will then have NaNs in the feature matrix"
    ]
   },
diff --git a/examples/featurize.ipynb b/examples/featurize.ipynb
index 65722823..2f3e3590 100644
--- a/examples/featurize.ipynb
+++ b/examples/featurize.ipynb
@@ -17,7 +17,8 @@
     "\n",
     "from mofdscribe.datasets import CoREDataset\n",
     "from mofdscribe.featurizers.chemistry import APRDF, RACS\n",
-    "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n"
+    "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n",
+    "from mofdscribe.mof import MOF\n"
    ]
   },
   {
@@ -108,7 +109,7 @@
    ],
    "source": [
     "racs_featurizer = RACS()\n",
-    "racs = racs_featurizer.featurize(next(structures))\n"
+    "racs = racs_featurizer.featurize(MOF(next(structures)))\n"
    ]
   },
   {
@@ -1977,7 +1978,7 @@
     }
    ],
    "source": [
-    "feats_all = featurizer.featurize_many(list(ds.get_structures(range(3))), ignore_errors=True)\n"
+    "feats_all = featurizer.featurize_many([MOF(s) for s in list(ds.get_structures(range(3)))], ignore_errors=True)\n"
    ]
   },
   {
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index baff7e82..85ed7496 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -3,17 +3,24 @@
 from typing import Collection, List, Optional, Tuple, Union
 
 import numpy as np
-from pymatgen.analysis.graphs import StructureGraph
+from pydantic import BaseModel
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
-from mofdscribe.featurizers.bu.fragment import fragment
 from mofdscribe.featurizers.bu.utils import boxed_molecule
 from mofdscribe.featurizers.utils import nan_array, set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 
 STRUCTURE_VIEWS = ("all_atom", "connecting", "binding")
 
+# since fragmentation is not super straightforward,
+# we give the option to provide the fragments directly
+class MOFBBs(BaseModel):
+    """Container for MOF building blocks."""
+
+    nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
+    linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
+
 
 class ConnectingSitesFeaturizer(MOFBaseFeaturizer):
     ...
@@ -53,7 +60,6 @@ class BUFeaturizer(MOFBaseFeaturizer):
                 mofbbs=MOFBBs(nodes=[BabelMolAdaptor.from_string(
                     "[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]", "smi").pymatgen_mol],
                 linkers=[BabelMolAdaptor.from_string("CCCC", "smi").pymatgen_mol]))
-
     """
 
     def __init__(
@@ -90,25 +96,24 @@ def feature_labels(self) -> List[str]:
 
     def _extract_bbs(
         self,
-        structure: Optional[Union[Structure, IStructure]] = None,
+        mof: Optional["MOF"],
         mofbbs: Optional[MOFBBs] = None,
     ):
-        if structure is None and mofbbs is None:
+        if mof is None and mofbbs is None:
             raise ValueError("You must provide a structure or mofbbs.")
 
-        if structure is not None:
-            fragments = fragment(structure)
+        if mof is not None:
+            fragments = mof.fragments
 
             if self._operates_on in ("both", "molecule"):
                 linkers = [linker.molecule for linker in fragments.linkers]
                 nodes = [node.molecule for node in fragments.nodes]
             else:
                 # create a boxed structure
-                linkers = [boxed_molecule(linker.molecule) for linker in fragments.linkers]
-                nodes = [boxed_molecule(node.molecule) for node in fragments.nodes]
+                linkers = [linker._get_boxed_structure() for linker in fragments.linkers]
+                nodes = [node._get_boxed_structure() for node in fragments.nodes]
 
         if mofbbs is not None:
-
             linkers = list(mofbbs.linkers) if mofbbs.linkers is not None else []
             nodes = list(mofbbs.nodes) if mofbbs.nodes is not None else []
             types = [type(node) for node in nodes] + [type(linker) for linker in linkers]
@@ -138,22 +143,25 @@ def _extract_bbs(
 
         return nodes, linkers
 
+    def _fit(self):
+        raise NotImplementedError("BUFeaturizer does not support _fit.")
+
     def fit(
         self,
-        structures: Optional[Collection[Structure]] = None,
+        mofs: Optional[Collection["MOF"]] = None,
         mofbbs: Optional[Collection[MOFBBs]] = None,
     ) -> None:
         """
         Fit the featurizer to the given structures.
 
         Args:
-            structures (Collection[Structure], optional): The structures to featurize.
+            structures (Collection[MOF], optional): The MOFs to featurize.
             mofbbs (Collection[MOFBBs], optional): The MOF fragments (nodes and linkers).
         """
         all_nodes, all_linkers = [], []
-        if structures is not None:
-            for structure in structures:
-                nodes, linkers = self._extract_bbs(structure=structure)
+        if mofs is not None:
+            for mof in mofs:
+                nodes, linkers = self._extract_bbs(mof=mof)
                 all_nodes.extend(nodes)
                 all_linkers.extend(linkers)
         if mofbbs is not None:
@@ -163,16 +171,14 @@ def fit(
                 all_linkers.extend(linkers)
 
         all_fragments = all_nodes + all_linkers
-        self._featurizer.fit(all_fragments)
+        self._featurizer._fit(all_fragments)
+
+    def _featurize(self):
+        raise NotImplementedError("BUFeaturizer does not support _featurize.")
 
-    # ToDo:
-    # - Perhaps use type dispatch instead of different keyword arguments.
-    #   (we can use fastcore or code it ourselves)
-    # - We can also directly pass the graph to the featurizer if it want to work
-    #     on the graph.
     def featurize(
         self,
-        structure: Optional[Union[Structure, IStructure]] = None,
+        mof: Optional["MOF"] = None,
         mofbbs: Optional[MOFBBs] = None,
     ) -> np.ndarray:
         """
@@ -186,7 +192,7 @@ def featurize(
         where possible.
 
         Args:
-            structure (Union[Structure, IStructure], optional): The structure to featurize.
+            mof (MOF, optional): The structure to featurize.
             mofbbs (MOFBBs, optional): The MOF fragments (nodes and linkers).
 
         Returns:
@@ -194,12 +200,12 @@ def featurize(
         """
         # if i know what the featurizer wants, I can always cast to a structure
         num_features = len(self._featurizer.feature_labels())
-        nodes, linkers = self._extract_bbs(structure, mofbbs)
-        linker_feats = [self._featurizer.featurize(linker) for linker in linkers]
+        nodes, linkers = self._extract_bbs(mof, mofbbs)
+        linker_feats = [self._featurizer._featurize(linker) for linker in linkers]
         if not linker_feats:
             linker_feats = [nan_array(num_features)]
 
-        node_feats = [self._featurizer.featurize(node) for node in nodes]
+        node_feats = [self._featurizer._featurize(node) for node in nodes]
         if not node_feats:
             node_feats = [nan_array(num_features)]
 
diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py
index 8370871a..dd5db2f7 100644
--- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py
+++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py
@@ -10,8 +10,8 @@
 from pymatgen.core import Molecule
 from structuregraph_helpers.create import get_local_env_method
 
-from .utils import create_rdkit_mol_from_mol_graph
-from ..utils.extend import operates_on_imolecule, operates_on_molecule
+from mofdscribe.featurizers.bu.utils import create_rdkit_mol_from_mol_graph
+from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule
 
 
 @operates_on_molecule
diff --git a/src/mofdscribe/featurizers/bu/shape_featurizer.py b/src/mofdscribe/featurizers/bu/shape_featurizer.py
index a71980ed..fd920efe 100644
--- a/src/mofdscribe/featurizers/bu/shape_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/shape_featurizer.py
@@ -13,7 +13,7 @@
 from rdkit.Chem.Descriptors3D import RadiusOfGyration as RadiusOfGyration_RDKIT
 from rdkit.Chem.Descriptors3D import SpherocityIndex as SpherocityIndex_RDKIT
 
-from mofdscribe.featurizes.bu.rdkitadaptor import RDKitAdaptor
+from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor
 
 
 def _rod_likeness(mol):
diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py
index 7f7deb3c..8e8bb585 100644
--- a/src/mofdscribe/featurizers/matmineradapter.py
+++ b/src/mofdscribe/featurizers/matmineradapter.py
@@ -2,32 +2,84 @@
 """Use matminer featurizers in mofdscribe."""
 from typing import List
 
+import numpy as np
+from matminer.featurizers.base import BaseFeaturizer
 from pymatgen.core.structure import Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 
 
 class MatminerAdapter(MOFBaseFeaturizer):
-    def __init__(self, matminer_featurizer):
+    """
+    MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe.
+
+    That is, you can then pass :py:class:`mofdscribe.mof.MOF` objects to the
+    featurizer.
+
+    .. note::
+
+        If your matminer featurizer needs an expensive
+        computation, e.g., the structure graph, if cannot reuse the
+        one computed on the ``MOF`` object.
+
+    Example:
+
+        >>> from mofdscribe.featurizers.matmineradapter import MatminerAdapter
+        >>> from matminer.featurizers.structure import DensityFeatures
+        >>> from mofdscribe.mof import MOF
+        >>> from pymatgen.core.structure import Structure
+        >>> hkust_structure = Structure.from_file("tests/data/hkust-1.cif")
+        >>> matminer_featurizer = DensityFeatures()
+        >>> adapter = MatminerAdapter(matminer_featurizer)
+        >>> features = adapter.featurize(MOF(hkust_structure))
+        >>> original_features = matminer_featurizer.featurize(hkust_structure)
+        >>> np.allclose(features, original_features)
+        True
+    """
+
+    def __init__(self, matminer_featurizer: BaseFeaturizer):
+        """Construct a MatminerAdapter.
+
+        Args:
+            matminer_featurizer (BaseFeaturizer): A matminer featurizer.
+        """
         self.matminer_featurizer = matminer_featurizer
 
     def _featurize(self, structure: Structure):
+        """Call the featurize method of the matminer featurizer."""
         return self.matminer_featurizer.featurize(structure)
 
-    def featurize(self, mof):
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        """Call the featurize method of the matminer featurizer using a MOF as input.
+
+        Args:
+            mof (MOF): A MOF object.
+
+        Returns:
+            np.ndarray: The features.
+        """
         return self._featurize(mof.structure)
 
-    def _fit(self, structures: List[Structure]):
+    def _fit(self, structures: List[Structure]) -> np.ndarray:
+        """Call the fit method of the matminer featurizer."""
         self.matminer_featurizer.fit(structures)
 
-    def fit(self, mofs):
-        self._fit([mof.structure for mof in mofs])
+    def fit(self, mofs: List["MOF"]):
+        """Call the fit method of the matminer featurizer using a list of MOFs as input.
+
+        Args:
+            mofs (List[MOF]): A list of MOF objects.
+        """
+        self._fit([Structure.from_sites(mof.structure.sites) for mof in mofs])
 
     def feature_labels(self):
+        """Call the feature_labels method of the matminer featurizer."""
         return self.matminer_featurizer.feature_labels()
 
     def citations(self):
+        """Call the citations method of the matminer featurizer."""
         return self.matminer_featurizer.citations()
 
     def implementors(self):
+        """Call the implementors method of the matminer featurizer."""
         return self.matminer_featurizer.implementors()
diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py
index 90c6b3ab..0ea5db30 100644
--- a/tests/featurizers/bu/test_bu_featurizer.py
+++ b/tests/featurizers/bu/test_bu_featurizer.py
@@ -7,7 +7,9 @@
 from pymatgen.core import Structure
 
 from mofdscribe.featurizers.bu.bu_featurizer import BUFeaturizer, MOFBBs
+from mofdscribe.featurizers.matmineradapter import MatminerAdapter
 from mofdscribe.featurizers.topology import PHStats
+from mofdscribe.mof import MOF
 
 
 def test_bu_featurizer(hkust_structure, molecule):
@@ -21,7 +23,7 @@ def test_bu_featurizer(hkust_structure, molecule):
     assert features[600] < 2
 
     featurizer = BUFeaturizer(PHStats(no_supercell=True))
-    features = featurizer.featurize(structure=hkust_structure)
+    features = featurizer.featurize(mof=MOF(hkust_structure))
     assert features.shape == (768,)
     assert features[0] > 0
     assert features[0] < 2
@@ -30,28 +32,31 @@ def test_bu_featurizer(hkust_structure, molecule):
 def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure):
     """Test that we can call BU featurizers with matminer molecules."""
     # we disable the periodic keyword to be able to compare with the molecules
-    base_feat = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False))
+    base_feat = MatminerAdapter(SiteStatsFingerprint(SOAP(4, 4, 4, 0.1, False, "gto", False)))
     hkust_structure = Structure.from_sites(hkust_structure.sites)
-    base_feat.fit([hkust_structure])
     featurizer = BUFeaturizer(base_feat, aggregations=("mean",))
-    features = featurizer.featurize(structure=hkust_structure)
-    assert features.shape == (2592 * 2,)
+    featurizer.fit([MOF(hkust_structure)])
+    features = featurizer.featurize(mof=MOF(hkust_structure))
+    assert features.shape == (400 * 2,)
     assert features[0] >= 0
     assert features[0] < 2
 
     linker_feats = [f for f in featurizer.feature_labels() if "linker" in f]
-    assert len(linker_feats) == 2592
+    assert len(linker_feats) == 400
 
     # test that our fit method works
     featurizer = BUFeaturizer(
-        SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)), aggregations=("mean",)
+        MatminerAdapter(SiteStatsFingerprint(SOAP(4, 4, 4, 0.1, False, "gto", False))),
+        aggregations=("mean",),
     )
-    featurizer.fit([hkust_structure])
-    features_direct_fit = featurizer.featurize(structure=hkust_structure)
+    featurizer.fit([MOF(hkust_structure)])
+    features_direct_fit = featurizer.featurize(mof=MOF(hkust_structure))
     assert np.allclose(features, features_direct_fit, rtol=0.01)
 
     # test that the linker features are actually the ones we get when
     # we featurize the linker
-    linker_feats = featurizer._featurizer.featurize(hkust_linker_structure)
+    linker_feats = featurizer._featurizer._featurize(hkust_linker_structure)
     linker_feature_mask = [i for i, f in enumerate(featurizer.feature_labels()) if "linker" in f]
-    assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.01, equal_nan=True)
+    assert len(features[linker_feature_mask]) == len(linker_feats)
+    # todo: check where potential differences come from 
+    assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3)
diff --git a/tests/test_files/BTC.cif b/tests/test_files/BTC.cif
index baabe76e..c724395e 100644
--- a/tests/test_files/BTC.cif
+++ b/tests/test_files/BTC.cif
@@ -1,57 +1,44 @@
-
-#######################################################################
-#
-#                 Cambridge Crystallographic Data Centre
-#                                CCDC
-#
-#######################################################################
-#
-# If this CIF has been generated from an entry in the Cambridge
-# Structural Database, then it will include bibliographic, chemical,
-# crystal, experimental, refinement or atomic coordinate data resulting
-# from the CCDC's data processing and validation procedures.
-#
-#######################################################################
-
+# generated using pymatgen
 data_HC3O2
-_symmetry_cell_setting           triclinic
 _symmetry_space_group_name_H-M   'P 1'
-_symmetry_Int_Tables_number      1
-_space_group_name_Hall           'P 1'
+_cell_length_a   7.82719732
+_cell_length_b   7.82719732
+_cell_length_c   7.82719732
+_cell_angle_alpha   90.00000000
+_cell_angle_beta   90.00000000
+_cell_angle_gamma   90.00000000
+_symmetry_Int_Tables_number   1
+_chemical_formula_structural   HC3O2
+_chemical_formula_sum   'H3 C9 O6'
+_cell_volume   479.53338312
+_cell_formula_units_Z   3
 loop_
-_symmetry_equiv_pos_site_id
-_symmetry_equiv_pos_as_xyz
-1 x,y,z
-_cell_length_a                   26.34300000
-_cell_length_b                   26.34300000
-_cell_length_c                   26.34300000
-_cell_angle_alpha                90.00000000
-_cell_angle_beta                 90.00000000
-_cell_angle_gamma                90.00000000
-_cell_volume                     18280.8
+ _symmetry_equiv_pos_site_id
+ _symmetry_equiv_pos_as_xyz
+  1  'x, y, z'
 loop_
-_atom_site_label
-_atom_site_type_symbol
-_atom_site_fract_x
-_atom_site_fract_y
-_atom_site_fract_z
-C378 C 0.32200000 0.67800000 0.61300000
-O379 O 0.44800000 0.74300000 0.68300000
-C380 C 0.43100000 0.70300000 0.70300000
-C381 C 0.29700000 0.70300000 0.56900000
-C382 C 0.32200000 0.61300000 0.67800000
-H383 H 0.38000000 0.72800000 0.62000000
-O384 O 0.31700000 0.74300000 0.55200000
-C385 C 0.38700000 0.67800000 0.67800000
-C386 C 0.30100000 0.63400000 0.63400000
-H387 H 0.27200000 0.62000000 0.62000000
-C388 C 0.36600000 0.63400000 0.69900000
-H389 H 0.38000000 0.62000000 0.72800000
-O390 O 0.44800000 0.68300000 0.74300000
-O391 O 0.25700000 0.68300000 0.55200000
-O392 O 0.25700000 0.55200000 0.68300000
-C393 C 0.36600000 0.69900000 0.63400000
-O394 O 0.31700000 0.55200000 0.74300000
-C395 C 0.29700000 0.56900000 0.70300000
-
-#END
+ _atom_site_type_symbol
+ _atom_site_label
+ _atom_site_symmetry_multiplicity
+ _atom_site_fract_x
+ _atom_site_fract_y
+ _atom_site_fract_z
+ _atom_site_occupancy
+  H  H0  1  -2.08665495  -1.27891755  -2.45013678  1
+  O  O1  1  -2.50062036  -1.50777648  -2.29868601  1
+  C  C2  1  -2.28185815  -1.30247656  -2.28185815  1
+  O  O3  1  -1.85779602  -0.86495213  -2.29868601  1
+  C  C4  1  -2.13377296  -1.22843396  -2.35253517  1
+  O  O5  1  -2.50062036  -1.06688648  -1.85779602  1
+  H  H6  1  -2.45013678  -1.27891755  -2.08665495  1
+  C  C7  1  -2.28185815  -1.08371434  -2.06309594  1
+  C  C8  1  -2.36599746  -0.99957503  -1.91501075  1
+  C  C9  1  -2.13377296  -1.01303732  -2.13377296  1
+  O  O10  1  -2.29868601  -0.86495213  -1.85779602  1
+  O  O11  1  -2.29868601  -1.50777648  -2.50062036  1
+  C  C12  1  -2.35253517  -1.22843396  -2.13377296  1
+  C  C13  1  -1.91501075  -0.99957503  -2.36599746  1
+  C  C14  1  -2.06309594  -1.08371434  -2.28185815  1
+  H  H15  1  -2.08665495  -0.91543572  -2.08665495  1
+  O  O16  1  -1.85779602  -1.06688648  -2.50062036  1
+  C  C17  1  -2.36599746  -1.45056175  -2.36599746  1
diff --git a/tests/test_matmineradapter.py b/tests/test_matmineradapter.py
new file mode 100644
index 00000000..24c74bac
--- /dev/null
+++ b/tests/test_matmineradapter.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+from matminer.featurizers.site import SOAP
+from matminer.featurizers.structure import DensityFeatures, SiteStatsFingerprint
+from pymatgen.core.structure import Structure
+
+from mofdscribe.featurizers.matmineradapter import MatminerAdapter
+from mofdscribe.mof import MOF
+
+
+def test_matminer_adapter(hkust_structure):
+    matminer_featurizer = DensityFeatures()
+    adapter = MatminerAdapter(matminer_featurizer)
+    features = adapter.featurize(MOF(hkust_structure))
+    original_features = matminer_featurizer.featurize(hkust_structure)
+
+    assert np.allclose(features, original_features)
+
+    assert adapter.feature_labels() == matminer_featurizer.feature_labels()
+    assert adapter.citations() == matminer_featurizer.citations()
+    assert adapter.implementors() == matminer_featurizer.implementors()
+
+
+def test_matminer_adapter_with_fit(hkust_structure):
+    original_featurizer = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False))
+    hkust_structure = Structure.from_sites(hkust_structure.sites)
+    original_featurizer.fit([hkust_structure])
+    original_features = original_featurizer.featurize(hkust_structure)
+
+    matminer_featurizer = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False))
+    adapter = MatminerAdapter(matminer_featurizer)
+    adapter.fit([MOF(hkust_structure)])
+    features = adapter.featurize(MOF(hkust_structure))
+
+    assert np.allclose(features, original_features)

From b3554a74e2dfcd114aacbc33aa42ff3b8ceb0e69 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Thu, 22 Dec 2022 19:47:40 +0100
Subject: [PATCH 05/17] continue refactor

---
 .../featurizers/bu/bu_featurizer.py           | 102 ++++++++++++++++--
 src/mofdscribe/featurizers/bu/bu_matches.py   |   4 +
 .../bu/compositionstats_featurizer.py         |   3 +
 .../bu/distance_hist_featurizer.py            |   5 +
 .../bu/distance_stats_featurizer.py           |   5 +
 .../featurizers/bu/lsop_featurizer.py         |  13 ++-
 src/mofdscribe/featurizers/bu/rdkitadaptor.py |  15 +++
 tests/featurizers/bu/test_bu_featurizer.py    |   2 +-
 tests/featurizers/bu/test_lsop_featurizer.py  |   3 +-
 9 files changed, 140 insertions(+), 12 deletions(-)

diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index 85ed7496..a2454508 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 """Compute features on the BUs and then aggregate them."""
+from abc import abstractmethod
 from typing import Collection, List, Optional, Tuple, Union
 
 import numpy as np
@@ -10,6 +11,7 @@
 from mofdscribe.featurizers.bu.utils import boxed_molecule
 from mofdscribe.featurizers.utils import nan_array, set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
+from mofdscribe.mof import MOF
 
 STRUCTURE_VIEWS = ("all_atom", "connecting", "binding")
 
@@ -22,14 +24,6 @@ class MOFBBs(BaseModel):
     linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
 
 
-class ConnectingSitesFeaturizer(MOFBaseFeaturizer):
-    ...
-
-
-class BindingSitesFeaturizer(MOFBaseFeaturizer):
-    ...
-
-
 # If we would cache the fragmentation, the implementation would be simpler
 # ToDo: support moffragmentor options
 # ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method).
@@ -62,6 +56,8 @@ class BUFeaturizer(MOFBaseFeaturizer):
                 linkers=[BabelMolAdaptor.from_string("CCCC", "smi").pymatgen_mol]))
     """
 
+    _NAME = "BUFeaturizer"
+
     def __init__(
         self,
         featurizer: MOFBaseFeaturizer,
@@ -91,7 +87,7 @@ def feature_labels(self) -> List[str]:
         for bb in ["node", "linker"]:
             for aggregation in self._aggregations:
                 for label in base_labels:
-                    labels.append(f"{bb}_{aggregation}_{label}")
+                    labels.append(f"{self._NAME}_{bb}_{aggregation}_{label}")
         return labels
 
     def _extract_bbs(
@@ -226,3 +222,91 @@ def citations(self) -> List[str]:
 
     def implementors(self) -> List[str]:
         return self._featurizer.implementors()
+
+
+class _BUSubBaseFeaturizer(BUFeaturizer):
+    def featurize(
+        self,
+        mof: Optional["MOF"] = None,
+    ) -> np.ndarray:
+        """
+        Compute features on the BUs and then aggregate them.
+
+        If you provide a structure, we will fragment the MOF into BUs.
+        If you already have precomputed fragements or only want to consider a subset
+        of the BUs, you can provide them manually via the `mofbbs` argument.
+
+        If you manually provide the `mofbbs`,  we will convert molecules to structures
+        where possible.
+
+        Args:
+            mof (MOF, optional): The structure to featurize.
+
+        Returns:
+            A numpy array of features.
+        """
+        # if i know what the featurizer wants, I can always cast to a structure
+        num_features = len(self._featurizer.feature_labels())
+        nodes, linkers = self._extract(mof)
+        linker_feats = [self._featurizer._featurize(linker) for linker in linkers]
+        if not linker_feats:
+            linker_feats = [nan_array(num_features)]
+
+        node_feats = [self._featurizer._featurize(node) for node in nodes]
+        if not node_feats:
+            node_feats = [nan_array(num_features)]
+
+        aggregated_linker_feats = []
+        for aggregation in self._aggregations:
+            aggregated_linker_feats.extend(ARRAY_AGGREGATORS[aggregation](linker_feats, axis=0))
+        aggregated_linker_feats = np.array(aggregated_linker_feats)
+
+        aggregated_node_feats = []
+        for aggregation in self._aggregations:
+            aggregated_node_feats.extend(ARRAY_AGGREGATORS[aggregation](node_feats, axis=0))
+        aggregated_node_feats = np.array(aggregated_node_feats)
+
+        return np.concatenate((aggregated_node_feats, aggregated_linker_feats))
+
+    @abstractmethod
+    def _extract(self, mof: "MOF") -> Tuple[List[Structure], List[Structure]]:
+        raise NotImplementedError()
+
+    def fit(self, mofs: Collection["MOF"]):
+        all_nodes, all_linkers = [], []
+        for mof in mofs:
+            nodes, linkers = self._extract(mof)
+            all_nodes.extend(nodes)
+            all_linkers.extend(linkers)
+        self._featurizer.fit(all_nodes + all_linkers)
+
+    def citations(self) -> List[str]:
+        return self._featurizer.citations()
+
+    def implementors(self) -> List[str]:
+        return self._featurizer.implementors()
+
+
+class BindingSitesFeaturizer(_BUSubBaseFeaturizer):
+    """A special BU featurizer that operates on structures spanned by "binding sites".
+
+    From more details see.
+
+
+    Example:
+        >>> from mofdscribe.mof import MOF
+        >>> from mofdscribe.featurizers.bu.bu_featurizer import BindingSitesFeaturizer
+        >>> from mofdscribe.featurizers.bu.lsop_featurizer import LSOP
+        >>> import pandas as pd
+        >>> mof = MOF.from_file("mof_file.cif")
+        >>> featurizer = BindingSitesFeaturizer(LSOP())
+        >>> feats = featurizer.featurize(mof)
+        >>> pd.DataFrame([feats], columns=featurizer.feature_labels())
+    """
+
+    _NAME = "BindingSitesFeaturizer"
+
+    def _extract(self, mof: "MOF"):
+        linkers = [l._get_binding_sites_structure() for l in mof.fragments.linkers]
+        nodes = [n._get_binding_sites_structure() for n in mof.fragments.nodes]
+        return nodes, linkers
diff --git a/src/mofdscribe/featurizers/bu/bu_matches.py b/src/mofdscribe/featurizers/bu/bu_matches.py
index 9140bdc1..ec154caa 100644
--- a/src/mofdscribe/featurizers/bu/bu_matches.py
+++ b/src/mofdscribe/featurizers/bu/bu_matches.py
@@ -173,6 +173,10 @@ def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
     def featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray:
+        """Structure is here spanned by the connecting points of a BU."""
+        return self._featurize(s)
+
+    def _featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray:
         """Structure is here spanned by the connecting points of a BU."""
         features = []
         for topo in self.topos:
diff --git a/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py b/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py
index 7ec7b6a9..e6b2749d 100644
--- a/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py
@@ -58,6 +58,9 @@ def feature_labels(self) -> List[str]:
         return feature_labels
 
     def featurize(self, molecule: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray:
+        return self._featurize(molecule)
+
+    def _featurize(self, molecule: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray:
         encodings = defaultdict(list)
         for encoding in self.encodings:
             for site in molecule.sites:
diff --git a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py
index 17e31dea..dc5fea1f 100644
--- a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py
@@ -63,6 +63,11 @@ def feature_labels(self) -> List[str]:
         return [f"pairwise_distance_hist_{a}" for a in self._get_grid()]
 
     def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray:
+        return self._featurize(structure)
+
+    def _featurize(
+        self, structure: Union[Molecule, IMolecule, Structure, IStructure]
+    ) -> np.ndarray:
         distances = []
         for i, _ in enumerate(structure):
             for j, _ in enumerate(structure):
diff --git a/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py b/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py
index d2231460..0816ab88 100644
--- a/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py
@@ -43,6 +43,11 @@ def feature_labels(self) -> List[str]:
         return [f"pairwise_distance_stats_{a}" for a in self.aggregations]
 
     def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray:
+        return self._featurize(structure)
+
+    def _featurize(
+        self, structure: Union[Molecule, IMolecule, Structure, IStructure]
+    ) -> np.ndarray:
         distances = []
         for i, _ in enumerate(structure):
             for j, _ in enumerate(structure):
diff --git a/src/mofdscribe/featurizers/bu/lsop_featurizer.py b/src/mofdscribe/featurizers/bu/lsop_featurizer.py
index 63d3a42a..c4cf0e50 100644
--- a/src/mofdscribe/featurizers/bu/lsop_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/lsop_featurizer.py
@@ -88,7 +88,18 @@ def __init__(
     def feature_labels(self) -> List[str]:
         return [f"lsop_{val}" for val in self.types]
 
-    def featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray:
+    def featurize(self, mof: "MOF") -> np.ndarray:
+        """Compute the LSOP for a fragment.
+
+        Args:
+            mof (MOF): The MOF to compute the LSOP for.
+
+        Returns:
+            np.ndarray: The LSOP values.
+        """
+        return self._featurize(mof.structure)
+
+    def _featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray:
         molecule = Molecule.from_sites(s.sites)
         com = molecule.center_of_mass
         orginal_len = len(molecule)
diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py
index dd5db2f7..b8e1ee35 100644
--- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py
+++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py
@@ -70,6 +70,21 @@ def featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray:
         If the input molecule is a Molecule, we convert it to a MoleculeGraph
         using the local environment strategy specified in the constructor.
 
+        Args:
+            molecule: A pymatgen Molecule or MoleculeGraph object.
+
+        Returns:
+            A numpy array of features.
+        """
+        return self._featurize(molecule)
+
+    def _featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray:
+        """
+        Call the RDKit featurizer on the molecule.
+
+        If the input molecule is a Molecule, we convert it to a MoleculeGraph
+        using the local environment strategy specified in the constructor.
+
         Args:
             molecule: A pymatgen Molecule or MoleculeGraph object.
 
diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py
index 0ea5db30..4535aa6e 100644
--- a/tests/featurizers/bu/test_bu_featurizer.py
+++ b/tests/featurizers/bu/test_bu_featurizer.py
@@ -58,5 +58,5 @@ def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_st
     linker_feats = featurizer._featurizer._featurize(hkust_linker_structure)
     linker_feature_mask = [i for i, f in enumerate(featurizer.feature_labels()) if "linker" in f]
     assert len(features[linker_feature_mask]) == len(linker_feats)
-    # todo: check where potential differences come from 
+    # todo: check where potential differences come from
     assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3)
diff --git a/tests/featurizers/bu/test_lsop_featurizer.py b/tests/featurizers/bu/test_lsop_featurizer.py
index bacd3a50..1d5ea85c 100644
--- a/tests/featurizers/bu/test_lsop_featurizer.py
+++ b/tests/featurizers/bu/test_lsop_featurizer.py
@@ -2,12 +2,13 @@
 """Test that the fragment LSOP featurizer works."""
 
 from mofdscribe.featurizers.bu.lsop_featurizer import LSOP
+from mofdscribe.mof import MOF
 
 
 def test_lsop_featurizer(triangle_structure):
     """Test that the LSOP featurizer works."""
     featurizer = LSOP(types=["tri_plan", "sq_pyr", "cn"])
-    features = featurizer.featurize(triangle_structure)
+    features = featurizer.featurize(MOF(triangle_structure))
     assert features.shape == (3,)
     assert len(features) == len(featurizer.feature_labels())
     assert features[0] > 0.5

From 96cc158cea4b8f627eebf223078fbca208575373 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Fri, 23 Dec 2022 13:56:14 +0100
Subject: [PATCH 06/17] refactoring done (?)

---
 setup.cfg                                     | 18 ++--
 src/mofdscribe/featurizers/base.py            | 92 +++++--------------
 .../featurizers/bu/bu_featurizer.py           | 68 ++++++++++----
 .../featurizers/bu/lsop_featurizer.py         |  3 +-
 .../featurizers/bu/nconf20_featurizer.py      |  2 +-
 src/mofdscribe/featurizers/chemistry/amd.py   |  6 +-
 src/mofdscribe/featurizers/chemistry/aprdf.py |  3 +-
 .../featurizers/chemistry/energygrid.py       |  3 +-
 src/mofdscribe/featurizers/chemistry/henry.py |  3 +-
 .../chemistry/partialchargehistogram.py       |  3 +-
 .../chemistry/partialchargestats.py           |  3 +-
 src/mofdscribe/featurizers/chemistry/price.py |  3 +-
 src/mofdscribe/featurizers/chemistry/racs.py  | 18 +++-
 .../featurizers/graph/dimensionality.py       |  4 +-
 .../featurizers/graph/graphfeaturizer.py      |  3 +
 .../featurizers/hostguest/guest_aprdf.py      |  3 +-
 .../hostguest/host_guest_featurizer.py        |  7 +-
 src/mofdscribe/featurizers/matmineradapter.py | 10 +-
 .../featurizers/pore/geometric_properties.py  | 14 ++-
 src/mofdscribe/featurizers/pore/voxelgrid.py  |  3 +-
 .../featurizers/topology/atom_centered_ph.py  |  5 +-
 .../featurizers/topology/ph_hist.py           |  3 +-
 .../featurizers/topology/ph_image.py          |  9 +-
 .../featurizers/topology/ph_stats.py          |  4 +-
 .../featurizers/topology/ph_vect.py           | 37 ++++++--
 src/mofdscribe/mof.py                         |  5 +-
 tests/featurizers/bu/test_bu_featurizer.py    | 48 +++++++++-
 tests/featurizers/test_base.py                | 42 +--------
 tests/featurizers/topology/test_ph_vect.py    | 17 ++--
 tests/test_matmineradapter.py                 |  1 +
 tox.ini                                       |  8 +-
 31 files changed, 247 insertions(+), 201 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index ef2f8b0e..62ab6d3a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -177,17 +177,17 @@ strictness = short
 #########################
 [flake8]
 ignore =
-    S301 # pickle
-    S403 # pickle
+    S301 
+    S403 
     S404
     S603
-    W503 # Line break before binary operator (flake8 is wrong)
-    E203  # whitespace before ':'
-    S101 # Complaining about assert statements
-    D101 # Docstring missing
-    D102 # Docstring missing
-    D103 # Docstring missing
-    D104 # Docstring missing
+    W503 
+    E203 
+    S101 
+    D101 
+    D102 
+    D103 
+    D104 
     D400
 exclude =
     .tox,
diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py
index e5105a91..761058cc 100644
--- a/src/mofdscribe/featurizers/base.py
+++ b/src/mofdscribe/featurizers/base.py
@@ -12,16 +12,12 @@
 or and iterable of pymatgen structure objects.
 """
 from abc import abstractmethod
-from multiprocessing import Pool
 from typing import Collection, Union
 
 import numpy as np
-import pandas as pd
-from loguru import logger
 from matminer.featurizers.base import BaseFeaturizer, MultipleFeaturizer
 from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
-from tqdm.auto import tqdm
 
 from mofdscribe.mof import MOF
 
@@ -106,81 +102,37 @@ class MOFMultipleFeaturizer(MultipleFeaturizer):
         In this case, you're better off using two separate MultipleFeaturizers.
     """
 
-    def __init__(self, featurizers, iterate_over_entries=True, primitive=True):
+    def __init__(self, featurizers):
         """
         Construct a MOFMultipleFeaturizer.
 
         Args:
             featurizers (list): List of featurizers to use.
-            iterate_over_entries (bool): If True, featurize each entry in the list.
-                If False, featurize each structure in the list. Defaults to True.
-            primitive (bool): If True, use the primitive cell of the structure.
-                Defaults to True.
         """
-        # unset the primitive on the individual featurizers
-        for featurizer in featurizers:
-            featurizer.primitive = False
-
+        self.iterate_over_entries = True
         self.featurizers = featurizers
-        self.iterate_over_entries = iterate_over_entries
-        self.primitive_multiple = primitive
-
-    def _get_primitive(self, structure: Union[Structure, IStructure]) -> Structure:
-        return get_primitive(structure)
-
-    def featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
-        logger.debug(f"Featurizing structure in MOFMultipleFeaturizer, {self.primitive_multiple}")
-        if self.primitive_multiple:
-            logger.debug("Getting primitive cell")
-            structure = self._get_primitive(structure)
-        return np.array([feature for f in self.featurizers for feature in f.featurize(structure)])
-
-    def _get_primitive_many(self, structures):
-        if self.n_jobs == 1:
-            return [self._get_primitive(s) for s in structures]
-        else:
-            with Pool(self.n_jobs, maxtasksperchild=1) as p:
-                res = p.map(self._get_primitive, structures, chunksize=self.chunksize)
-                return res
+
+    def featurize(self, mof: MOF) -> np.ndarray:
+        return np.array([feature for f in self.featurizers for feature in f.featurize(mof)])
 
     def featurize_many(self, entries, ignore_errors=False, return_errors=False, pbar=True):
-        logger.debug("Precomputing primitive cells")
-        if self.primitive_multiple:
-            entries = self._get_primitive_many(entries)
-        if self.iterate_over_entries:
-            return np.array(
-                super().featurize_many(
-                    entries,
-                    ignore_errors=ignore_errors,
-                    return_errors=return_errors,
-                    pbar=pbar,
-                )
+        return np.array(
+            super().featurize_many(
+                entries,
+                ignore_errors=ignore_errors,
+                return_errors=return_errors,
+                pbar=pbar,
             )
-        else:
-            features = [
-                f.featurize_many(
-                    entries,
-                    ignore_errors=ignore_errors,
-                    return_errors=return_errors,
-                    pbar=pbar,
-                )
-                for f in self.featurizers
-            ]
-
-            return np.hstack(features)
+        )
 
     def featurize_wrapper(self, x, return_errors=False, ignore_errors=False):
-        if self.iterate_over_entries:
-            return np.array(
-                [
-                    feature
-                    for f in self.featurizers
-                    for feature in f.featurize_wrapper(
-                        x, return_errors=return_errors, ignore_errors=ignore_errors
-                    )
-                ]
-            )
-        else:
-            return super().featurize_wrapper(
-                x, return_errors=return_errors, ignore_errors=ignore_errors
-            )
+
+        return np.array(
+            [
+                feature
+                for f in self.featurizers
+                for feature in f.featurize_wrapper(
+                    x, return_errors=return_errors, ignore_errors=ignore_errors
+                )
+            ]
+        )
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index a2454508..2641c133 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -4,16 +4,18 @@
 from typing import Collection, List, Optional, Tuple, Union
 
 import numpy as np
+from matminer.featurizers.base import MultipleFeaturizer
 from pydantic import BaseModel
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
-from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer
 from mofdscribe.featurizers.bu.utils import boxed_molecule
 from mofdscribe.featurizers.utils import nan_array, set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.mof import MOF
 
-STRUCTURE_VIEWS = ("all_atom", "connecting", "binding")
+__all__ = ("BUFeaturizer", "BranchingSitesFeaturizer", "BindingSitesFeaturizer")
+
 
 # since fragmentation is not super straightforward,
 # we give the option to provide the fragments directly
@@ -24,9 +26,6 @@ class MOFBBs(BaseModel):
     linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
 
 
-# If we would cache the fragmentation, the implementation would be simpler
-# ToDo: support moffragmentor options
-# ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method).
 class BUFeaturizer(MOFBaseFeaturizer):
     """
     Compute features on the BUs and then aggregate them.
@@ -76,7 +75,12 @@ def __init__(
                 If you do not do this, we default to assuming that it operates on structures.
             aggregations (Tuple[str]): The aggregations to use.
                 Must be one of :py:obj:`ARRAY_AGGREGATORS`.
+
+        Raises:
+            ValueError: If the featurizer is a `MultipleFeaturizer`.
         """
+        if isinstance(featurizer, (MOFMultipleFeaturizer, MultipleFeaturizer)):
+            raise ValueError("BUFeaturizer does not support MultipleFeaturizer.")
         self._featurizer = featurizer
         self._aggregations = aggregations
         set_operates_on(self, featurizer)
@@ -92,7 +96,7 @@ def feature_labels(self) -> List[str]:
 
     def _extract_bbs(
         self,
-        mof: Optional["MOF"],
+        mof: Optional[MOF],
         mofbbs: Optional[MOFBBs] = None,
     ):
         if mof is None and mofbbs is None:
@@ -144,14 +148,14 @@ def _fit(self):
 
     def fit(
         self,
-        mofs: Optional[Collection["MOF"]] = None,
+        mofs: Optional[Collection[MOF]] = None,
         mofbbs: Optional[Collection[MOFBBs]] = None,
     ) -> None:
         """
         Fit the featurizer to the given structures.
 
         Args:
-            structures (Collection[MOF], optional): The MOFs to featurize.
+            mofs (Collection[MOF], optional): The MOFs to featurize.
             mofbbs (Collection[MOFBBs], optional): The MOF fragments (nodes and linkers).
         """
         all_nodes, all_linkers = [], []
@@ -174,7 +178,7 @@ def _featurize(self):
 
     def featurize(
         self,
-        mof: Optional["MOF"] = None,
+        mof: Optional[MOF] = None,
         mofbbs: Optional[MOFBBs] = None,
     ) -> np.ndarray:
         """
@@ -227,7 +231,7 @@ def implementors(self) -> List[str]:
 class _BUSubBaseFeaturizer(BUFeaturizer):
     def featurize(
         self,
-        mof: Optional["MOF"] = None,
+        mof: Optional[MOF] = None,
     ) -> np.ndarray:
         """
         Compute features on the BUs and then aggregate them.
@@ -269,10 +273,10 @@ def featurize(
         return np.concatenate((aggregated_node_feats, aggregated_linker_feats))
 
     @abstractmethod
-    def _extract(self, mof: "MOF") -> Tuple[List[Structure], List[Structure]]:
+    def _extract(self, mof: MOF) -> Tuple[List[Structure], List[Structure]]:
         raise NotImplementedError()
 
-    def fit(self, mofs: Collection["MOF"]):
+    def fit(self, mofs: Collection[MOF]):
         all_nodes, all_linkers = [], []
         for mof in mofs:
             nodes, linkers = self._extract(mof)
@@ -290,7 +294,11 @@ def implementors(self) -> List[str]:
 class BindingSitesFeaturizer(_BUSubBaseFeaturizer):
     """A special BU featurizer that operates on structures spanned by "binding sites".
 
-    From more details see.
+    We define binding sites as the linker atoms that directly connect to
+    the metal atoms of a node.
+    A good example are the carboxy oxygen atoms in a copper paddlewheel.
+    From more details see the `moffragmentor documentation
+    <https://moffragmentor.readthedocs.io/en/latest/background.html#fragmentation>`_.
 
 
     Example:
@@ -306,7 +314,35 @@ class BindingSitesFeaturizer(_BUSubBaseFeaturizer):
 
     _NAME = "BindingSitesFeaturizer"
 
-    def _extract(self, mof: "MOF"):
-        linkers = [l._get_binding_sites_structure() for l in mof.fragments.linkers]
-        nodes = [n._get_binding_sites_structure() for n in mof.fragments.nodes]
+    def _extract(self, mof: MOF):
+        linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers]
+        nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes]
+        return nodes, linkers
+
+
+class BranchingSitesFeaturizer(_BUSubBaseFeaturizer):
+    """A special BU featurizer that operates on structures spanned by "branching sites".
+
+    Branching sites are defined as the sites where a linker connects to a node.
+    The most common example is the carbon atom in a carboxy group.
+    From more details see the `moffragmentor documentation
+    <https://moffragmentor.readthedocs.io/en/latest/background.html#fragmentation>`_.
+
+
+    Example:
+        >>> from mofdscribe.mof import MOF
+        >>> from mofdscribe.featurizers.bu.bu_featurizer import BranchingSitesFeaturizer
+        >>> from mofdscribe.featurizers.bu.lsop_featurizer import LSOP
+        >>> import pandas as pd
+        >>> mof = MOF.from_file("mof_file.cif")
+        >>> featurizer = BranchingSitesFeaturizer(LSOP())
+        >>> feats = featurizer.featurize(mof)
+        >>> pd.DataFrame([feats], columns=featurizer.feature_labels())
+    """
+
+    _NAME = "BranchingSitesFeaturizer"
+
+    def _extract(self, mof: MOF):
+        linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers]
+        nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes]
         return nodes, linkers
diff --git a/src/mofdscribe/featurizers/bu/lsop_featurizer.py b/src/mofdscribe/featurizers/bu/lsop_featurizer.py
index c4cf0e50..60f4f964 100644
--- a/src/mofdscribe/featurizers/bu/lsop_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/lsop_featurizer.py
@@ -14,6 +14,7 @@
     operates_on_molecule,
     operates_on_structure,
 )
+from mofdscribe.mof import MOF
 
 _default_types = (
     "cn",
@@ -88,7 +89,7 @@ def __init__(
     def feature_labels(self) -> List[str]:
         return [f"lsop_{val}" for val in self.types]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         """Compute the LSOP for a fragment.
 
         Args:
diff --git a/src/mofdscribe/featurizers/bu/nconf20_featurizer.py b/src/mofdscribe/featurizers/bu/nconf20_featurizer.py
index b5b3a245..de38d89e 100644
--- a/src/mofdscribe/featurizers/bu/nconf20_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/nconf20_featurizer.py
@@ -7,7 +7,7 @@
 from rdkit import Chem
 from rdkit.Chem import AllChem
 
-from .rdkitadaptor import RDKitAdaptor
+from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor
 
 
 # ToDo: Allow to set some of the options, e.g. for UFF
diff --git a/src/mofdscribe/featurizers/chemistry/amd.py b/src/mofdscribe/featurizers/chemistry/amd.py
index 54a488be..55068171 100644
--- a/src/mofdscribe/featurizers/chemistry/amd.py
+++ b/src/mofdscribe/featurizers/chemistry/amd.py
@@ -7,6 +7,7 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.substructures import filter_element
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ["AMD"]
@@ -81,7 +82,7 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
@@ -92,6 +93,9 @@ def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
 
         Returns:
             A numpy array containing the AMD descriptor.
+
+        Raises:
+            ImportError: If the AMD package is not installed.
         """
         try:
             from amd._nns import nearest_neighbours
diff --git a/src/mofdscribe/featurizers/chemistry/aprdf.py b/src/mofdscribe/featurizers/chemistry/aprdf.py
index 95ee0967..3376d7be 100644
--- a/src/mofdscribe/featurizers/chemistry/aprdf.py
+++ b/src/mofdscribe/featurizers/chemistry/aprdf.py
@@ -13,6 +13,7 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils.aggregators import AGGREGATORS
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ["APRDF"]
@@ -101,7 +102,7 @@ def _get_feature_labels(self):
 
         return list(aprdfs.flatten())
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.array:
diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py
index fbb0e3f6..e8a8dd5b 100644
--- a/src/mofdscribe/featurizers/chemistry/energygrid.py
+++ b/src/mofdscribe/featurizers/chemistry/energygrid.py
@@ -13,6 +13,7 @@
 from mofdscribe.featurizers.utils.histogram import get_rdf
 from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell
 from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ["EnergyGridHistogram"]
@@ -190,7 +191,7 @@ def feature_labels(self) -> List[str]:
                 labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}")
         return labels
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.array:
diff --git a/src/mofdscribe/featurizers/chemistry/henry.py b/src/mofdscribe/featurizers/chemistry/henry.py
index fa8a13f0..fde1bbf2 100644
--- a/src/mofdscribe/featurizers/chemistry/henry.py
+++ b/src/mofdscribe/featurizers/chemistry/henry.py
@@ -11,6 +11,7 @@
 from mofdscribe.featurizers.utils.raspa.base_parser import parse_base_output
 from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell
 from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ["Henry"]
@@ -142,7 +143,7 @@ def __init__(
         self.run_eqeq = run_eqeq
         self.return_std = return_std
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.array:
diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
index 3ac436d9..963e7f78 100644
--- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
+++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
@@ -9,6 +9,7 @@
 from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.histogram import get_rdf
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ["PartialChargeHistogram"]
@@ -49,7 +50,7 @@ def _get_grid(self):
     def feature_labels(self) -> List[str]:
         return [f"chargehist_{val}" for val in self._get_grid()]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(s=mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/chemistry/partialchargestats.py b/src/mofdscribe/featurizers/chemistry/partialchargestats.py
index 4be296e0..fd668e85 100644
--- a/src/mofdscribe/featurizers/chemistry/partialchargestats.py
+++ b/src/mofdscribe/featurizers/chemistry/partialchargestats.py
@@ -9,6 +9,7 @@
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ["PartialChargeStats"]
@@ -41,7 +42,7 @@ def __init__(self, aggregations: Tuple[str] = ("max", "min", "std")) -> None:
     def feature_labels(self) -> List[str]:
         return [f"chargestat_{agg}" for agg in self.aggregations]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(s=mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py
index afde093d..3e80205f 100644
--- a/src/mofdscribe/featurizers/chemistry/price.py
+++ b/src/mofdscribe/featurizers/chemistry/price.py
@@ -9,6 +9,7 @@
 import pandas as pd
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -121,7 +122,7 @@ def feature_labels(self) -> List[str]:
             labels.append(f"price_lower_bound_{projection}")
         return labels
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py
index 4087d034..11012092 100644
--- a/src/mofdscribe/featurizers/chemistry/racs.py
+++ b/src/mofdscribe/featurizers/chemistry/racs.py
@@ -18,6 +18,7 @@
     get_neighbors_at_distance,
     get_structure_graph,
 )
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 __all__ = ("RACS",)
@@ -201,13 +202,20 @@ def __init__(
                 "nodes",
             ]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
-        return self._featurize(mof.structure)
+    def featurize(self, mof: MOF) -> np.ndarray:
+        return self._featurize(mof.structure_graph)
 
-    def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
-        if isinstance(structure, Structure):
+    def _featurize(self, structure: Union[StructureIStructureType, StructureGraph]) -> np.ndarray:
+        if isinstance(structure, (Structure, IStructure)):
             structure = IStructure.from_sites(structure)
-        sg = get_structure_graph(structure, self.bond_heuristic)
+            sg = get_structure_graph(structure, self.bond_heuristic)
+        elif isinstance(structure, StructureGraph):
+            sg = structure
+            structure = sg.structure
+        else:
+            raise TypeError(
+                f"Structure must be pymatgen Structure or StructureGraph, found {type(structure)}"
+            )
 
         racs = {}
         # This finds all indices for a particular subset of atoms
diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py
index 7c0d9c77..d966b544 100644
--- a/src/mofdscribe/featurizers/graph/dimensionality.py
+++ b/src/mofdscribe/featurizers/graph/dimensionality.py
@@ -7,13 +7,15 @@
 from pymatgen.analysis.graphs import StructureGraph
 
 from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer
+from mofdscribe.mof import MOF
 
 
 class Dimensionality(GraphFeaturizer):
     def __init__(self) -> None:
+        """Construct a new Dimensionality featurizer."""
         super().__init__()
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure_graph)
 
     def _featurize(self, structure_graph: StructureGraph) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py
index 4f5dddc0..20f3520b 100644
--- a/src/mofdscribe/featurizers/graph/graphfeaturizer.py
+++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+"""Base class for featurizers that operate on graphs."""
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils.extend import operates_on_structuregraph
 
@@ -12,4 +13,6 @@
 # all of this could then be orchestrated in a pipeline object
 @operates_on_structuregraph
 class GraphFeaturizer(MOFBaseFeaturizer):
+    """Base class for graph featurizers."""
+
     pass
diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
index 313c7433..7de9c50d 100644
--- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
+++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
@@ -10,6 +10,7 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.hostguest.utils import _extract_host_guest
 from mofdscribe.featurizers.utils.aggregators import AGGREGATORS
+from mofdscribe.mof import MOF
 
 __all__ = ["GuestCenteredAPRDF"]
 
@@ -94,7 +95,7 @@ def _extract_host_guest(
             local_env_method=self._local_env_method,
         )
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(structure=mof.structure)
 
     def _featurize(
diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
index 129ba3c4..1d725074 100644
--- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
+++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
@@ -11,6 +11,7 @@
 from mofdscribe.featurizers.hostguest.utils import _extract_host_guest
 from mofdscribe.featurizers.utils import set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
+from mofdscribe.mof import MOF
 
 
 # ToDo: How do we handle if there is no guest?
@@ -92,12 +93,12 @@ def _extract_host_guest(self, structure: Union[Structure, IStructure]):
             local_env_method=self._local_env_method,
         )
 
-    def fit(self, mofs: Collection["MOF"]):
+    def fit(self, mofs: Collection[MOF]):
         """
         Fit the featurizer to the given MOFs.
 
         Args:
-            mofs (Collection["MOF"]): The MOFs to fit to.
+            mofs (Collection[MOF]): The MOFs to fit to.
         """
         structures = [mof.structure for mof in mofs]
         self._fit(structures=structures)
@@ -121,7 +122,7 @@ def _fit(
 
         self._featurizer.fit(all_hosts + all_guests)
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(structure=mof.structure)
 
     def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py
index 8e8bb585..24be9ba5 100644
--- a/src/mofdscribe/featurizers/matmineradapter.py
+++ b/src/mofdscribe/featurizers/matmineradapter.py
@@ -7,23 +7,21 @@
 from pymatgen.core.structure import Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.mof import MOF
 
 
 class MatminerAdapter(MOFBaseFeaturizer):
-    """
-    MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe.
+    """MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe.
 
     That is, you can then pass :py:class:`mofdscribe.mof.MOF` objects to the
     featurizer.
 
     .. note::
-
         If your matminer featurizer needs an expensive
         computation, e.g., the structure graph, if cannot reuse the
         one computed on the ``MOF`` object.
 
     Example:
-
         >>> from mofdscribe.featurizers.matmineradapter import MatminerAdapter
         >>> from matminer.featurizers.structure import DensityFeatures
         >>> from mofdscribe.mof import MOF
@@ -49,7 +47,7 @@ def _featurize(self, structure: Structure):
         """Call the featurize method of the matminer featurizer."""
         return self.matminer_featurizer.featurize(structure)
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         """Call the featurize method of the matminer featurizer using a MOF as input.
 
         Args:
@@ -64,7 +62,7 @@ def _fit(self, structures: List[Structure]) -> np.ndarray:
         """Call the fit method of the matminer featurizer."""
         self.matminer_featurizer.fit(structures)
 
-    def fit(self, mofs: List["MOF"]):
+    def fit(self, mofs: List[MOF]):
         """Call the fit method of the matminer featurizer using a list of MOFs as input.
 
         Args:
diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py
index 09ca6beb..3f36804e 100644
--- a/src/mofdscribe/featurizers/pore/geometric_properties.py
+++ b/src/mofdscribe/featurizers/pore/geometric_properties.py
@@ -21,6 +21,7 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils import is_tool
 from mofdscribe.featurizers.utils.tempdir import TEMPDIR
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 ZEOPP_BASE_COMMAND = ["network"]
@@ -194,7 +195,7 @@ def __init__(
         self.labels = ["lis", "lifs", "lifsp"]
         self.ha = ha
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType):
@@ -281,7 +282,7 @@ def __init__(
 
         self.labels = [f"{label}_{self.probe_radius}" for label in labels]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.ndarray:
@@ -372,7 +373,7 @@ def __init__(
         ]
         self.labels = [f"{label}_{self.probe_radius}" for label in labels]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.ndarray:
@@ -462,8 +463,6 @@ def __init__(
                 It has been reported that this can lead to issues
                 for some structures.
                 Default is True.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to True.
         """
         if channel_radius is not None and probe_radius != channel_radius:
             logger.warning(
@@ -486,7 +485,7 @@ def __init__(
     def feature_labels(self) -> List[str]:
         return [f"ray_hist_{self.probe_radius}_{i}" for i in range(1000)]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.ndarray:
@@ -566,7 +565,6 @@ def __init__(
         channel_radius: Optional[Union[str, float]] = None,
         hist_type: str = "derivative",
         ha: bool = False,
-        primitive: bool = True,
     ) -> None:
         """Initialize the PoreSizeDistribution featurizer.
 
@@ -624,7 +622,7 @@ def __init__(
     def feature_labels(self) -> List[str]:
         return [f"psd_hist_{self.probe_radius}_{i}" for i in range(1000)]
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: StructureIStructureType) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py
index 68795c01..9007f6af 100644
--- a/src/mofdscribe/featurizers/pore/voxelgrid.py
+++ b/src/mofdscribe/featurizers/pore/voxelgrid.py
@@ -8,6 +8,7 @@
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase
+from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
 
@@ -151,7 +152,7 @@ def __init__(
         self.flatten = flatten
         self.regular_bounding_box = regular_bounding_box
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, structure: StructureIStructureType) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/topology/atom_centered_ph.py b/src/mofdscribe/featurizers/topology/atom_centered_ph.py
index c79be734..e14b4cb9 100644
--- a/src/mofdscribe/featurizers/topology/atom_centered_ph.py
+++ b/src/mofdscribe/featurizers/topology/atom_centered_ph.py
@@ -16,6 +16,7 @@
 from mofdscribe.featurizers.utils import flatten
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.mof import MOF
 
 
 # Todo: allow doing this with cutoff and coordination shells
@@ -73,7 +74,7 @@ def __init__(
         self.dimensions = dimensions
         self.alpha_weight = alpha_weight
 
-    def featurize(self, mof: "MOF", idx: int) -> np.ndarray:
+    def featurize(self, mof: MOF, idx: int) -> np.ndarray:
         """Compute the features for a single site.
 
         Args:
@@ -226,7 +227,7 @@ def _get_relevant_atom_type(self, element: str) -> str:
             if element in atom_type:
                 return atom_type
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py b/src/mofdscribe/featurizers/topology/ph_hist.py
index b236b786..0fbdfa42 100644
--- a/src/mofdscribe/featurizers/topology/ph_hist.py
+++ b/src/mofdscribe/featurizers/topology/ph_hist.py
@@ -14,6 +14,7 @@
     operates_on_molecule,
     operates_on_structure,
 )
+from mofdscribe.mof import MOF
 
 
 @operates_on_imolecule
@@ -114,7 +115,7 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(
diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py
index f2d03283..54f59545 100644
--- a/src/mofdscribe/featurizers/topology/ph_image.py
+++ b/src/mofdscribe/featurizers/topology/ph_image.py
@@ -17,6 +17,7 @@
     operates_on_molecule,
     operates_on_structure,
 )
+from mofdscribe.mof import MOF
 
 
 @operates_on_imolecule
@@ -110,8 +111,6 @@ def __init__(
                 in the analysis (experimental!). Defaults to False.
             no_supercell (bool): If true, then the supercell is not created.
                 Defaults to False.
-            primitive (bool): If True, the structure is reduced to its primitive
-                form before the descriptor is computed. Defaults to False.
             alpha_weight (Optional[str]): If specified, the use weighted alpha shapes,
                 i.e., replacing the points with balls of varying radii.
                 For instance `atomic_radius_calculated` or `van_der_waals_radius`.
@@ -175,7 +174,7 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(
@@ -205,14 +204,14 @@ def _featurize(
                 features.append(np.array(results["image"][element][dim]).flatten())
         return np.concatenate(features)
 
-    def fit(self, mofs: List["MOF"]) -> None:
+    def fit(self, mofs: List[MOF]) -> None:
         """Use structures to estimate the settings for the featurizer.
 
         Find the limits (maximum/minimum birth/death and persistence)
         for all the structures in the dataset and store them in the object.
 
         Args:
-            mofs (List["MOF"]): List of MOFs to find the limits for.
+            mofs (List[MOF]): List of MOFs to find the limits for.
         """
         structures = [mof.structure for mof in mofs]
         self._fit(structures)
diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py
index bed8ee3d..650d358a 100644
--- a/src/mofdscribe/featurizers/topology/ph_stats.py
+++ b/src/mofdscribe/featurizers/topology/ph_stats.py
@@ -17,6 +17,7 @@
     operates_on_molecule,
     operates_on_structure,
 )
+from mofdscribe.mof import MOF
 
 
 @operates_on_imolecule
@@ -50,7 +51,6 @@ def __init__(
         aggregation_functions: Tuple[str] = ("min", "max", "mean", "std"),
         periodic: bool = False,
         no_supercell: bool = False,
-        primitive: bool = False,
         alpha_weight: Optional[str] = None,
     ) -> None:
         """Initialize the PHStats object.
@@ -105,7 +105,7 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
-    def featurize(self, mof: "MOF") -> np.ndarray:
+    def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure)
 
     def _featurize(
diff --git a/src/mofdscribe/featurizers/topology/ph_vect.py b/src/mofdscribe/featurizers/topology/ph_vect.py
index 7566f793..02135c2c 100644
--- a/src/mofdscribe/featurizers/topology/ph_vect.py
+++ b/src/mofdscribe/featurizers/topology/ph_vect.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Featurizers using persistent homology -- vectorized using Gaussian mixture models."""
 from collections import defaultdict
-from typing import List, Optional, Tuple, Union
+from typing import Collection, List, Optional, Tuple, Union
 
 import numpy as np
 from loguru import logger
@@ -9,14 +9,14 @@
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.topology._tda_helpers import get_diagrams_for_structure
 from mofdscribe.featurizers.utils.extend import (
     operates_on_imolecule,
     operates_on_istructure,
     operates_on_molecule,
     operates_on_structure,
 )
-
-from ._tda_helpers import get_diagrams_for_structure
+from mofdscribe.mof import MOF
 
 
 def _apply_and_fill(transformer_func, diagrams):
@@ -237,7 +237,6 @@ def __init__(
         self.periodic = periodic
         self.no_supercell = no_supercell
         self.alpha_weight = alpha_weight
-        super().__init__(primitive=primitive)
 
     def _get_feature_labels(self) -> List[str]:
         labels = []
@@ -251,6 +250,9 @@ def _get_feature_labels(self) -> List[str]:
     def feature_labels(self) -> List[str]:
         return self._get_feature_labels()
 
+    def featurize(self, mof: MOF) -> np.ndarray:
+        return self._featurize(mof.structure)
+
     def _featurize(
         self, structure: Union[Structure, IStructure, Molecule, IMolecule]
     ) -> np.ndarray:
@@ -269,9 +271,15 @@ def _featurize(
         compiled_results = self._reshape_results(res, 1).flatten()
         return compiled_results
 
-    def fit(self, structures: Union[Structure, IStructure, Molecule, IMolecule]) -> "PHVect":
-        if self.primitive:
-            structures = self._get_primitive_many(structures)
+    def fit(self, mofs: Collection["MOF"]):
+        """Fit the featurizer to a collection of MOFs.
+
+        Args:
+            mofs (Collection[MOF]): A collection of MOFs to fit the featurizer to.
+        """
+        self._fit([mof.structure for mof in mofs])
+
+    def _fit(self, structures: Union[Structure, IStructure, Molecule, IMolecule]) -> "PHVect":
         self.transformers, _ = _fit_transform_structures(
             self.transformers,
             structures,
@@ -294,11 +302,20 @@ def _reshape_results(self, results, num_structures) -> np.ndarray:
                 n_col += self.n_components
         return compiled_results
 
-    def fit_transform(
+    def fit_transform(self, mofs: Collection["MOF"]) -> np.ndarray:
+        """Fit the featurizer to a collection of MOFs and return the featurized values.
+
+        Args:
+            mofs (Collection[MOF]): A collection of MOFs to fit the featurizer to.
+
+        Returns:
+            np.ndarray: The featurized values.
+        """
+        return self._fit_transform([mof.structure for mof in mofs])
+
+    def _fit_transform(
         self, structures: Union[Structure, IStructure, Molecule, IMolecule]
     ) -> np.ndarray:
-        if self.primitive:
-            structures = self._get_primitive_many(structures)
         self.transformers, results = _fit_transform_structures(
             self.transformers,
             structures,
diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py
index c79b31fa..dd5b3956 100644
--- a/src/mofdscribe/mof.py
+++ b/src/mofdscribe/mof.py
@@ -98,6 +98,7 @@ def from_file(cls, path: PathType, primitive: bool = True) -> "MOF":
             path (PathType): The path to the file.
             primitive (bool): Whether to use the primitive cell or not.
                 Defaults to ``True``.
+
         Returns:
             MOF: The MOF class.
         """
@@ -119,14 +120,14 @@ def fragments(self) -> namedtuple:
             namedtuple: :py:class:`~moffragmentor.fragment.Fragments` namedtuple.
         """
         try:
-            from moffragmentor import MOF as MOFFragmentorMOF
+            from moffragmentor import MOF as MOFFRAGMENTORMOF
         except ImportError:
             raise ImportError(
                 "moffragmentor is not installed. Please install it to use the fragments feature.\
                 See https://github.com/kjappelbaum/moffragmentor for more information."
             )
 
-        mof = MOFFragmentorMOF(self.__structure, self.structure_graph)
+        mof = MOFFRAGMENTORMOF(self.__structure, self.structure_graph)
         return mof.fragment(**self.__fragmentation_kwargs)
 
     @cached_property
diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py
index 4535aa6e..51eac95e 100644
--- a/tests/featurizers/bu/test_bu_featurizer.py
+++ b/tests/featurizers/bu/test_bu_featurizer.py
@@ -2,11 +2,18 @@
 """Test the BU featurizer."""
 
 import numpy as np
+import pandas as pd
 from matminer.featurizers.site import SOAP
 from matminer.featurizers.structure import SiteStatsFingerprint
 from pymatgen.core import Structure
 
-from mofdscribe.featurizers.bu.bu_featurizer import BUFeaturizer, MOFBBs
+from mofdscribe.featurizers.bu.bu_featurizer import (
+    BindingSitesFeaturizer,
+    BranchingSitesFeaturizer,
+    BUFeaturizer,
+    MOFBBs,
+)
+from mofdscribe.featurizers.bu.lsop_featurizer import LSOP
 from mofdscribe.featurizers.matmineradapter import MatminerAdapter
 from mofdscribe.featurizers.topology import PHStats
 from mofdscribe.mof import MOF
@@ -60,3 +67,42 @@ def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_st
     assert len(features[linker_feature_mask]) == len(linker_feats)
     # todo: check where potential differences come from
     assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3)
+
+
+def test_binding_site_featurizer(hkust_structure):
+    """Make sure we can featurize on the 'binding' substructure and that the features make some sense."""
+    featurizer = BindingSitesFeaturizer(LSOP())
+    feats = featurizer.featurize(mof=MOF(hkust_structure))
+    labels = featurizer.feature_labels()
+    for label in labels:
+        assert "BindingSite" in label
+    assert len(feats) == len(labels)
+
+    df = pd.DataFrame(data=[feats], columns=labels)
+
+    assert df["BindingSitesFeaturizer_node_mean_lsop_cn"].values[0] == 8.0
+
+    assert df["BindingSitesFeaturizer_node_mean_lsop_cuboct_max"].values[0] > 0.95
+
+    assert df["BindingSitesFeaturizer_linker_mean_lsop_cn"].values[0] == 6.0
+
+    assert df["BindingSitesFeaturizer_linker_mean_lsop_hex_plan_max"].values[0] > 0.4
+
+
+def test_branching_site_featurizer(hkust_structure):
+    """Make sure we can featurize on the 'branching' substructure and that the features make some sense."""
+    featurizer = BranchingSitesFeaturizer(LSOP())
+    feats = featurizer.featurize(mof=MOF(hkust_structure))
+    labels = featurizer.feature_labels()
+    for label in labels:
+        assert "BranchingSite" in label
+    assert len(feats) == len(labels)
+
+    df = pd.DataFrame(data=[feats], columns=labels)
+
+    assert df["BranchingSitesFeaturizer_node_mean_lsop_cn"].values[0] == 4.0
+    assert df["BranchingSitesFeaturizer_linker_mean_lsop_cn"].values[0] == 3.0
+
+    assert df["BranchingSitesFeaturizer_node_mean_lsop_sq_plan_max"].values[0] > 0.9
+
+    assert df["BranchingSitesFeaturizer_linker_mean_lsop_tri_plan"].values[0] > 0.98
diff --git a/tests/featurizers/test_base.py b/tests/featurizers/test_base.py
index 1f0d3ef5..f4ce290e 100644
--- a/tests/featurizers/test_base.py
+++ b/tests/featurizers/test_base.py
@@ -1,56 +1,24 @@
-import numpy as np
-import pandas as pd
-from matminer.featurizers.base import MultipleFeaturizer
 from matminer.featurizers.structure import DensityFeatures
 
 from mofdscribe.featurizers.base import MOFMultipleFeaturizer
 from mofdscribe.featurizers.chemistry import RACS
+from mofdscribe.featurizers.matmineradapter import MatminerAdapter
+from mofdscribe.mof import MOF
 
 
 def test_mofmultiplefeaturizer(hkust_structure, irmof_structure):
     """Test that the calls work. However, I currently do not know of a good
     way of testing if and how often get_primitive is called
     (other than looking at the logs)"""
-    structures = [hkust_structure, irmof_structure]
-    primitive_structures = [structure.get_primitive_structure() for structure in structures]
-    featurizer = MOFMultipleFeaturizer([RACS(), DensityFeatures()], primitive=True)
-    for featurizer_ in featurizer.featurizers:
-        assert featurizer_.primitive is False
-
+    featurizer = MOFMultipleFeaturizer([RACS(), MatminerAdapter(DensityFeatures())])
     # make sure the simple call works
-    features = featurizer.featurize(irmof_structure)
+    features = featurizer.featurize(MOF(irmof_structure))
     assert (
         len(features)
         == len(featurizer.feature_labels())
         == len(RACS().feature_labels()) + len(DensityFeatures().feature_labels())
     )
 
-    featurizer_no_primitive = MOFMultipleFeaturizer([RACS(), DensityFeatures()], primitive=False)
-    features_no_prim = featurizer_no_primitive.featurize(primitive_structures[-1])
-
-    assert np.allclose(features, features_no_prim, rtol=1e-2, atol=1e-2, equal_nan=True)
-
-    # now compare with using the "original" matminer MultipleFeaturizer
-    featurizer_orig = MultipleFeaturizer([RACS(), DensityFeatures()])
-    features_orig = featurizer_orig.featurize(primitive_structures[-1])
-    assert np.allclose(features, features_orig, rtol=1e-2, atol=1e-2, equal_nan=True)
-
     # make sure the multiple calls work
-    features_many_1 = featurizer.featurize_many(structures)
-    features_many_1_labels = featurizer.feature_labels()
+    features_many_1 = featurizer.featurize_many([MOF(irmof_structure), MOF(hkust_structure)])
     assert features_many_1.ndim == 2
-
-    featurizer = MOFMultipleFeaturizer(
-        [RACS(), DensityFeatures()], iterate_over_entries=False, primitive=True
-    )
-    features_many_2 = featurizer.featurize_many(structures)
-    features_many_2_labels = featurizer.feature_labels()
-    assert features_many_2.ndim == 2
-
-    features_many_1_df = pd.DataFrame(features_many_1, columns=features_many_1_labels)
-    features_many_2_df = pd.DataFrame(features_many_2, columns=features_many_2_labels)
-
-    # independent of the way we call the featurizer, the features should be the same
-    values_1 = features_many_1_df.values
-    values_2 = features_many_2_df[features_many_1_df.columns].values
-    assert np.allclose(values_1, values_2, rtol=1e-2, atol=1e-2, equal_nan=True)
diff --git a/tests/featurizers/topology/test_ph_vect.py b/tests/featurizers/topology/test_ph_vect.py
index 4ca80d6d..22ee06cf 100644
--- a/tests/featurizers/topology/test_ph_vect.py
+++ b/tests/featurizers/topology/test_ph_vect.py
@@ -3,6 +3,7 @@
 import pytest
 
 from mofdscribe.featurizers.topology.ph_vect import PHVect
+from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
 
@@ -12,20 +13,20 @@ def test_ph_vect(hkust_structure, irmof_structure, hkust_la_structure):
     # should raise if not fitted
     with pytest.raises(ValueError):
         ph_vect = PHVect()
-        ph_vect.featurize(hkust_structure)
+        ph_vect.featurize(MOF(hkust_structure))
 
     # should be able to fit and featurize
     ph_vect = PHVect(n_components=2, random_state=42)
-    ph_vect.fit([hkust_structure, irmof_structure])
+    ph_vect.fit([MOF(hkust_structure), MOF(irmof_structure)])
 
     # test fit_transform
     ph_vect = PHVect(n_components=2, random_state=42)
-    feat = ph_vect.fit_transform([hkust_structure, irmof_structure])
+    feat = ph_vect.fit_transform([MOF(hkust_structure), MOF(irmof_structure)])
     assert feat.shape == (2, 4 * 2 * 2)
     assert is_jsonable(dict(zip(ph_vect.feature_labels(), feat[0])))
     assert feat.ndim == 2
 
-    feat = ph_vect.featurize(hkust_structure)
+    feat = ph_vect.featurize(MOF(hkust_structure))
     assert feat.ndim == 1
 
     assert len(feat) == len(set(ph_vect.feature_labels()))
@@ -36,14 +37,14 @@ def test_ph_vect(hkust_structure, irmof_structure, hkust_la_structure):
     ph_vect = PHVect(
         n_components=2, atom_types=None, alpha_weight="atomic_radius_calculated", random_state=42
     )
-    hkust_feats = ph_vect.fit_transform([hkust_structure])
-    features_hkust_la = ph_vect.fit_transform([hkust_la_structure])
+    hkust_feats = ph_vect.fit_transform([MOF(hkust_structure)])
+    features_hkust_la = ph_vect.fit_transform([MOF(hkust_la_structure)])
     assert hkust_feats.shape == features_hkust_la.shape
     assert (hkust_feats != features_hkust_la).any()
 
     # now, to be sure do not encode the same thing with the atomic radius
     ph_vect = PHVect(n_components=2, atom_types=None, alpha_weight=None, random_state=42)
-    hkust_feats = ph_vect.fit_transform([hkust_structure])
-    features_hkust_la = ph_vect.fit_transform([hkust_la_structure])
+    hkust_feats = ph_vect.fit_transform([MOF(hkust_structure)])
+    features_hkust_la = ph_vect.fit_transform([MOF(hkust_la_structure)])
     assert hkust_feats.shape == features_hkust_la.shape
     assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2)
diff --git a/tests/test_matmineradapter.py b/tests/test_matmineradapter.py
index 24c74bac..c6ae1bfe 100644
--- a/tests/test_matmineradapter.py
+++ b/tests/test_matmineradapter.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+"""Test the MatminerAdapter."""
 import numpy as np
 from matminer.featurizers.site import SOAP
 from matminer.featurizers.structure import DensityFeatures, SiteStatsFingerprint
diff --git a/tox.ini b/tox.ini
index 2616e35e..66c0a385 100644
--- a/tox.ini
+++ b/tox.ini
@@ -11,7 +11,7 @@ envlist =
     lint
     #manifest
     pyroma
-    flake8 < 5
+    flake8 
     #mypy
     # documentation linters/checkers
     #doc8
@@ -56,17 +56,17 @@ description = Run linters.
 skip_install = true
 deps =
     darglint
-    flake8 < 5
+    flake8 
     flake8-black
     flake8-bugbear
     flake8-colors
     flake8-docstrings
-    flake8-isort < 5
+    flake8-isort 
     flake8-print
     pep8-naming
     pydocstyle
 commands =
-    flake8 src/mofdscribe/ tests/ setup.py
+    flake8 --color always src/mofdscribe/ tests/ setup.py 
 description = Run the flake8 tool with several plugins (docstrings, import order, pep8 naming). See https://cthoyt.com/2020/04/25/how-to-code-with-me-flake8.html for more information.
 
 [testenv:pyroma]

From 5fb0c86d3e675de7f75a7a945a702c4e654a1bba Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Fri, 23 Dec 2022 18:41:24 +0100
Subject: [PATCH 07/17] refactoring done

---
 docs/source/api.rst                           |  3 +-
 docs/source/api/featurizers.rst               | 17 ++++
 docs/source/api/structures.rst                |  9 +++
 docs/source/contributing.rst                  |  9 ++-
 .../featurizers/bu/bu_featurizer.py           | 81 ++++++++++++++++---
 src/mofdscribe/featurizers/chemistry/price.py |  3 +
 src/mofdscribe/featurizers/chemistry/racs.py  |  7 +-
 src/mofdscribe/featurizers/graph/__init__.py  |  1 +
 .../featurizers/graph/dimensionality.py       |  4 +-
 .../featurizers/hostguest/guest_aprdf.py      |  4 +-
 .../hostguest/host_guest_featurizer.py        |  6 +-
 src/mofdscribe/featurizers/hostguest/utils.py |  5 +-
 src/mofdscribe/featurizers/matmineradapter.py |  6 +-
 src/mofdscribe/featurizers/utils/__init__.py  | 12 +--
 src/mofdscribe/featurizers/utils/extend.py    |  3 +-
 tests/featurizers/bu/test_bu_featurizer.py    |  8 ++
 .../featurizers/graph/test_dimensionality.py  |  4 +-
 .../hostguest/test_host_guest_featurizer.py   | 13 ++-
 18 files changed, 155 insertions(+), 40 deletions(-)
 create mode 100644 docs/source/api/structures.rst

diff --git a/docs/source/api.rst b/docs/source/api.rst
index 537a6680..9e09f56f 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -10,4 +10,5 @@ API documentation
    api/splitters
    api/metrics
    api/bench
-   api/helpers
\ No newline at end of file
+   api/helpers
+   api/structures
\ No newline at end of file
diff --git a/docs/source/api/featurizers.rst b/docs/source/api/featurizers.rst
index 954b8b6f..048cb031 100644
--- a/docs/source/api/featurizers.rst
+++ b/docs/source/api/featurizers.rst
@@ -94,4 +94,21 @@ Host Guest featurization
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. automodule:: mofdscribe.featurizers.hostguest.host_guest_featurizer
+    :members:
+
+
+Graph featurization
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: mofdscribe.featurizers.graph.graph_featurizer
+    :members:
+
+.. automodule:: mofdscribe.featurizers.graph.dimensionality
+    :members:
+
+
+Matminer adapter 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: mofdscribe.featurizers.matmineradapter
     :members:
\ No newline at end of file
diff --git a/docs/source/api/structures.rst b/docs/source/api/structures.rst
new file mode 100644
index 00000000..dc6e7045
--- /dev/null
+++ b/docs/source/api/structures.rst
@@ -0,0 +1,9 @@
+
+Structure inputs
+-------------------
+
+MOF
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: mofdscribe.mof
+    :members:
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
index 713aa7da..96c1fa63 100644
--- a/docs/source/contributing.rst
+++ b/docs/source/contributing.rst
@@ -5,11 +5,12 @@ Extending and contributing to mofdscribe
 Implementing a new featurizer
 -----------------------------
 
-To implement a new featurizer, you typically need to create a new class that inherits from the :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. In this class, you need to implement three methods: 
-:py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` and :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation`.
+To implement a new featurizer, you typically need to create a new class that inherits from the :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. In this class, you need to implement four methods: 
+:py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` and :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation`.
+
+The main featurization logic happens in :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize`.
+Your method should accept as input a :py:class:`~pymatgen.core.Structure` (:py:class:`~pymatgen.core.IStructure`, :py:class:`~pymatgen.core.Molecule`, :py:class:`~pymatgen.core.StructureGraph`) object and return a :py:class:`numpy.array`. The   :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize` is supposed to extract the relevant object from a :py:object:`~mofdscribe.mof.MOF` object. 
 
-The main featurization logic happens in :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`.
-Your method should accept as input a :py:class:`~pymatgen.core.Structure` object and return a :py:class:`numpy.array`.
 The :py:meth:`mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` method should return a list of strings that describe the features returned by :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`. The number of feature names should match the number of features returned by :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize` (i.e. the number of columns in the feature matrix). The :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation` method should return a list of strings of BibTeX citations for the featurizer.
 
 Generally, you also want to decorate your structure with the 
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index 2641c133..38e2ee7a 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -6,6 +6,7 @@
 import numpy as np
 from matminer.featurizers.base import MultipleFeaturizer
 from pydantic import BaseModel
+from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer
@@ -14,6 +15,8 @@
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.mof import MOF
 
+if False:
+    from moffragmentor import SBU
 __all__ = ("BUFeaturizer", "BranchingSitesFeaturizer", "BindingSitesFeaturizer")
 
 
@@ -26,6 +29,18 @@ class MOFBBs(BaseModel):
     linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]]
 
 
+def _structuregraph_from_indices(mof, indices) -> StructureGraph:
+    from moffragmentor.utils import remove_all_nodes_not_in_indices
+
+    structure_graph = mof.structure_graph.__copy__()
+    remove_all_nodes_not_in_indices(structure_graph, indices)
+    return structure_graph
+
+
+def structuregraph_from_bu(mof: "MOF", bu: "SBU") -> StructureGraph:
+    return _structuregraph_from_indices(mof, bu._original_indices)
+
+
 class BUFeaturizer(MOFBaseFeaturizer):
     """
     Compute features on the BUs and then aggregate them.
@@ -104,10 +119,15 @@ def _extract_bbs(
 
         if mof is not None:
             fragments = mof.fragments
-
-            if self._operates_on in ("both", "molecule"):
+            if self._operates_on & set([Molecule]):
                 linkers = [linker.molecule for linker in fragments.linkers]
                 nodes = [node.molecule for node in fragments.nodes]
+            elif self._operates_on & set([StructureGraph]):
+                linkers = [structuregraph_from_bu(mof, linker) for linker in fragments.linkers]
+                nodes = [structuregraph_from_bu(mof, node) for node in fragments.nodes]
+            elif self._operates_on & set([MoleculeGraph]):
+                linkers = [linker.molecule_graph for linker in fragments.linkers]
+                nodes = [node.molecule_graph for node in fragments.nodes]
             else:
                 # create a boxed structure
                 linkers = [linker._get_boxed_structure() for linker in fragments.linkers]
@@ -122,18 +142,22 @@ def _extract_bbs(
                 raise ValueError("All nodes and linkers must be of the same type.")
 
             this_type = types[0]
-            if this_type in (Structure, IStructure) and self._operates_on in ("both", "structure"):
+            if this_type in (Structure, IStructure) and self._operates_on & set(
+                [Molecule, Structure]
+            ):
                 # this is the simple case, we do not need to convert to molecules
                 pass
-            elif this_type in (Molecule, IMolecule) and self._operates_on in ("both", "molecule"):
+            elif this_type in (Molecule, IMolecule) and self._operates_on & set(
+                [Molecule, Structure]
+            ):
                 # again simple case, we do not need to convert to structures
 
                 pass
-            elif this_type in (Molecule, IMolecule) and (self._operates_on == "structure"):
+            elif this_type in (Molecule, IMolecule) and (self._operates_on & set([Molecule])):
                 # we need to convert to structures
                 nodes = [boxed_molecule(node) for node in nodes]
                 linkers = [boxed_molecule(linker) for linker in linkers]
-            elif this_type in (Structure, IStructure) and (self._operates_on == "molecule"):
+            elif this_type in (Structure, IStructure) and (self._operates_on & set([Molecule])):
                 raise ValueError(
                     "You provided structures for a featurizer that operates on molecules. "
                     / "Cannot automatically convert to molecules from structures."
@@ -291,6 +315,7 @@ def implementors(self) -> List[str]:
         return self._featurizer.implementors()
 
 
+# ToDo: generalize extract depending on what the featurizer operates on
 class BindingSitesFeaturizer(_BUSubBaseFeaturizer):
     """A special BU featurizer that operates on structures spanned by "binding sites".
 
@@ -315,9 +340,25 @@ class BindingSitesFeaturizer(_BUSubBaseFeaturizer):
     _NAME = "BindingSitesFeaturizer"
 
     def _extract(self, mof: MOF):
-        linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers]
-        nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes]
-        return nodes, linkers
+        if Structure in self._operates_on:
+            linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers]
+            nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes]
+            return nodes, linkers
+        elif StructureGraph in self._operates_on:
+            linkers = [
+                _structuregraph_from_indices(mof, linker._original_binding_indices)
+                for linker in mof.fragments.linkers
+            ]
+            nodes = [
+                _structuregraph_from_indices(mof, node._original_binding_indices)
+                for node in mof.fragments.nodes
+            ]
+        elif Molecule in self._operates_on:
+            raise NotImplementedError("BindingSitesFeaturizer does not support Molecule yet.")
+        else:
+            raise NotImplementedError(
+                f"BindingSitesFeaturizer does not support featurizers operating on {self._operates_on}."
+            )
 
 
 class BranchingSitesFeaturizer(_BUSubBaseFeaturizer):
@@ -343,6 +384,22 @@ class BranchingSitesFeaturizer(_BUSubBaseFeaturizer):
     _NAME = "BranchingSitesFeaturizer"
 
     def _extract(self, mof: MOF):
-        linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers]
-        nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes]
-        return nodes, linkers
+        if Structure in self._operates_on:
+            linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers]
+            nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes]
+            return nodes, linkers
+        elif StructureGraph in self._operates_on:
+            linkers = [
+                _structuregraph_from_indices(mof, linker._original_binding_indices)
+                for linker in mof.fragments.linkers
+            ]
+            nodes = [
+                _structuregraph_from_indices(mof, node._original_binding_indices)
+                for node in mof.fragments.nodes
+            ]
+        elif Molecule in self._operates_on:
+            raise NotImplementedError("BranchingSitesFeaturizer does not support Molecule yet.")
+        else:
+            raise NotImplementedError(
+                f"BranchingSitesFeaturizer does not support featurizers operating on {self._operates_on}."
+            )
diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py
index 3e80205f..a89f5d83 100644
--- a/src/mofdscribe/featurizers/chemistry/price.py
+++ b/src/mofdscribe/featurizers/chemistry/price.py
@@ -9,6 +9,7 @@
 import pandas as pd
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
@@ -77,6 +78,8 @@ def _price_per_atom(element_price_fractions_kg, structure):
     )
 
 
+@operates_on_istructure
+@operates_on_structure
 class PriceLowerBound(MOFBaseFeaturizer):
     """Compute a lower bound for the price based on the element prices.
 
diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py
index 11012092..23159ce8 100644
--- a/src/mofdscribe/featurizers/chemistry/racs.py
+++ b/src/mofdscribe/featurizers/chemistry/racs.py
@@ -12,7 +12,11 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.chemistry._fragment import get_bb_indices
 from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS
-from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.featurizers.utils.extend import (
+    operates_on_istructure,
+    operates_on_structure,
+    operates_on_structuregraph,
+)
 from mofdscribe.featurizers.utils.structure_graph import (
     get_connected_site_indices,
     get_neighbors_at_distance,
@@ -124,6 +128,7 @@ def _get_racs_for_bbs(
 
 @operates_on_istructure
 @operates_on_structure
+@operates_on_structuregraph
 class RACS(MOFBaseFeaturizer):
     r"""Modified version of the revised autocorrelation functions (RACs) for MOFs.
 
diff --git a/src/mofdscribe/featurizers/graph/__init__.py b/src/mofdscribe/featurizers/graph/__init__.py
index e69de29b..5a4e7539 100644
--- a/src/mofdscribe/featurizers/graph/__init__.py
+++ b/src/mofdscribe/featurizers/graph/__init__.py
@@ -0,0 +1 @@
+from .dimensionality import Dimensionality  # noqa: F401
diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py
index d966b544..ff6814e8 100644
--- a/src/mofdscribe/featurizers/graph/dimensionality.py
+++ b/src/mofdscribe/featurizers/graph/dimensionality.py
@@ -7,9 +7,11 @@
 from pymatgen.analysis.graphs import StructureGraph
 
 from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer
+from mofdscribe.featurizers.utils.extend import operates_on_structuregraph
 from mofdscribe.mof import MOF
 
 
+@operates_on_structuregraph
 class Dimensionality(GraphFeaturizer):
     def __init__(self) -> None:
         """Construct a new Dimensionality featurizer."""
@@ -19,7 +21,7 @@ def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(mof.structure_graph)
 
     def _featurize(self, structure_graph: StructureGraph) -> np.ndarray:
-        return get_dimensionality_larsen(structure_graph)
+        return np.array([get_dimensionality_larsen(structure_graph)])
 
     def feature_labels(self) -> List[str]:
         return ["dimensionality"]
diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
index 7de9c50d..3dce89d2 100644
--- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
+++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 from element_coder import encode
-from pymatgen.core import IStructure, Structure
+from pymatgen.core import IStructure, Structure, Molecule
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.hostguest.utils import _extract_host_guest
@@ -91,7 +91,7 @@ def _extract_host_guest(
         return _extract_host_guest(
             structure=structure,
             remove_guests=True,
-            operates_on="molecule",
+            operates_on=set([Molecule]),
             local_env_method=self._local_env_method,
         )
 
diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
index 1d725074..f4c97249 100644
--- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
+++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py
@@ -120,7 +120,7 @@ def _fit(
             all_hosts.append(host_guest.host)
             all_guests.extend(host_guest.guests)
 
-        self._featurizer.fit(all_hosts + all_guests)
+        self._featurizer._fit(all_hosts + all_guests)
 
     def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(structure=mof.structure)
@@ -151,10 +151,10 @@ def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
             )
 
         guest_feats = []
-        host_feats = self._featurizer.featurize(host_guest.host)
+        host_feats = self._featurizer._featurize(host_guest.host)
 
         for guest in host_guest.guests:
-            guest_feats.append(self._featurizer.featurize(guest))
+            guest_feats.append(self._featurizer._featurize(guest))
 
         aggregated_feats = []
         for aggregation in self._aggregations:
diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py
index f53d7390..43a6acdb 100644
--- a/src/mofdscribe/featurizers/hostguest/utils.py
+++ b/src/mofdscribe/featurizers/hostguest/utils.py
@@ -39,7 +39,7 @@ class HostGuest(BaseModel):
 
 def _extract_host_guest(
     structure: Union[Structure, IStructure],
-    operates_on: str = "structure",
+    operates_on: str = set([Structure]),
     remove_guests: bool = True,
     local_env_method: str = "vesta",
 ):
@@ -55,7 +55,8 @@ def _extract_host_guest(
     else:
         host = structure_graph.structure
 
-    if operates_on == "structure":
+    # ToDo: generalize this to other types
+    if operates_on & set([Structure]):
         mols = [boxed_molecule(mol) for mol in mols]
 
     return HostGuest(host=host, guests=mols)
diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py
index 24be9ba5..91702df9 100644
--- a/src/mofdscribe/featurizers/matmineradapter.py
+++ b/src/mofdscribe/featurizers/matmineradapter.py
@@ -7,9 +7,11 @@
 from pymatgen.core.structure import Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.utils.extend import operates_on_structure
 from mofdscribe.mof import MOF
 
 
+@operates_on_structure
 class MatminerAdapter(MOFBaseFeaturizer):
     """MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe.
 
@@ -60,7 +62,9 @@ def featurize(self, mof: MOF) -> np.ndarray:
 
     def _fit(self, structures: List[Structure]) -> np.ndarray:
         """Call the fit method of the matminer featurizer."""
-        self.matminer_featurizer.fit(structures)
+        self.matminer_featurizer.fit(
+            [Structure.from_sites(structure.sites) for structure in structures]
+        )
 
     def fit(self, mofs: List[MOF]):
         """Call the fit method of the matminer featurizer using a list of MOFs as input.
diff --git a/src/mofdscribe/featurizers/utils/__init__.py b/src/mofdscribe/featurizers/utils/__init__.py
index 57db31df..fa7b1c78 100644
--- a/src/mofdscribe/featurizers/utils/__init__.py
+++ b/src/mofdscribe/featurizers/utils/__init__.py
@@ -5,7 +5,7 @@
 from shutil import which
 
 import numpy as np
-from pymatgen.core import Molecule, Structure
+from pymatgen.core import Structure
 
 if sys.version_info.major == 3 and sys.version_info.minor >= 10:
     from collections.abc import MutableMapping
@@ -45,12 +45,6 @@ def flatten(d: dict, parent_key: str = "", sep: str = ".") -> dict:
 def set_operates_on(self, featurizer):
     try:
         _operates_on = featurizer.operates_on()
-        if (Structure in _operates_on) and (Molecule in _operates_on):
-            self._operates_on = "both"
-        elif Structure in _operates_on:
-            self._operates_on = "structure"
-        elif Molecule in _operates_on:
-            self._operates_on = "molecule"
-
+        self._operates_on = set(_operates_on)
     except AttributeError:
-        self._operates_on = "structure"
+        self._operates_on = {Structure}
diff --git a/src/mofdscribe/featurizers/utils/extend.py b/src/mofdscribe/featurizers/utils/extend.py
index ed00c907..5d307996 100644
--- a/src/mofdscribe/featurizers/utils/extend.py
+++ b/src/mofdscribe/featurizers/utils/extend.py
@@ -3,7 +3,7 @@
 from functools import partial
 from typing import Type
 
-from pymatgen.analysis.graphs import StructureGraph
+from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph
 from pymatgen.core import IMolecule, IStructure, Molecule, Structure
 
 
@@ -37,3 +37,4 @@ def operates_on(self):
 operates_on_molecule = partial(add_operates_on, type=Molecule)
 operates_on_imolecule = partial(add_operates_on, type=IMolecule)
 operates_on_structuregraph = partial(add_operates_on, type=StructureGraph)
+operates_on_moleculegraph = partial(add_operates_on, type=MoleculeGraph)
diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py
index 51eac95e..14031012 100644
--- a/tests/featurizers/bu/test_bu_featurizer.py
+++ b/tests/featurizers/bu/test_bu_featurizer.py
@@ -14,6 +14,7 @@
     MOFBBs,
 )
 from mofdscribe.featurizers.bu.lsop_featurizer import LSOP
+from mofdscribe.featurizers.graph.dimensionality import Dimensionality
 from mofdscribe.featurizers.matmineradapter import MatminerAdapter
 from mofdscribe.featurizers.topology import PHStats
 from mofdscribe.mof import MOF
@@ -35,6 +36,13 @@ def test_bu_featurizer(hkust_structure, molecule):
     assert features[0] > 0
     assert features[0] < 2
 
+    # test if it works with graph featurizers
+    featurizer = BUFeaturizer(Dimensionality(), aggregations=("mean",))
+    features = featurizer.featurize(mof=MOF(hkust_structure))
+    assert features.shape == (2,)
+    assert features[0] == 0
+    assert features[1] == 0
+
 
 def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure):
     """Test that we can call BU featurizers with matminer molecules."""
diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py
index 52e3e85e..f8943113 100644
--- a/tests/featurizers/graph/test_dimensionality.py
+++ b/tests/featurizers/graph/test_dimensionality.py
@@ -7,5 +7,5 @@
 def test_dimensionality(hkust_structure):
     dim = Dimensionality()
     sg = get_structure_graph(hkust_structure, "vesta")
-    assert dim.featurize(MOF(hkust_structure)) == 3
-    assert dim._featurize(sg) == 3
+    assert dim.featurize(MOF(hkust_structure))[0] == 3
+    assert dim._featurize(sg)[0] == 3
diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py
index dbbd1bbf..b2993909 100644
--- a/tests/featurizers/hostguest/test_host_guest_featurizer.py
+++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py
@@ -1,14 +1,25 @@
 # -*- coding: utf-8 -*-
 from matminer.featurizers.structure.sites import SiteStatsFingerprint
 
+from mofdscribe.featurizers.chemistry import APRDF
 from mofdscribe.featurizers.hostguest import HostGuestFeaturizer
+from mofdscribe.featurizers.matmineradapter import MatminerAdapter
 from mofdscribe.mof import MOF
 
 
 def test_host_guest_featurizer(floating_structure):
     """Test the HostGuestFeaturizer."""
     featurizer = HostGuestFeaturizer(
-        featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"),
+        featurizer=APRDF(),
+        aggregations=("mean",),
+    )
+    features = featurizer.featurize(MOF(floating_structure))
+    labels = featurizer._featurizer.feature_labels()
+    assert len(features) == 2 * len(labels)
+
+    # Test the matminer adapter
+    featurizer = HostGuestFeaturizer(
+        featurizer=MatminerAdapter(SiteStatsFingerprint.from_preset("SOAP_formation_energy")),
         aggregations=("mean",),
     )
     featurizer.fit([MOF(floating_structure)])

From 2d5981e06c05528608929ecb38ded79ce0a69ffd Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Fri, 23 Dec 2022 19:11:54 +0100
Subject: [PATCH 08/17] remove lint

---
 src/mofdscribe/featurizers/hostguest/guest_aprdf.py | 2 +-
 src/mofdscribe/featurizers/hostguest/utils.py       | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
index 3dce89d2..b779786b 100644
--- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
+++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 from element_coder import encode
-from pymatgen.core import IStructure, Structure, Molecule
+from pymatgen.core import IStructure, Molecule, Structure
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.hostguest.utils import _extract_host_guest
diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py
index 43a6acdb..28e72e8a 100644
--- a/src/mofdscribe/featurizers/hostguest/utils.py
+++ b/src/mofdscribe/featurizers/hostguest/utils.py
@@ -39,10 +39,12 @@ class HostGuest(BaseModel):
 
 def _extract_host_guest(
     structure: Union[Structure, IStructure],
-    operates_on: str = set([Structure]),
+    operates_on: Optional[str] = None,
     remove_guests: bool = True,
     local_env_method: str = "vesta",
 ):
+    if operates_on is None:
+        operates_on = set([Structure])
 
     if not isinstance(structure, (IStructure)):
         structure = IStructure.from_sites(structure.sites)

From 0995bd3228889ebe39f95478fdbb0b58b993370f Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Sat, 24 Dec 2022 09:49:19 +0100
Subject: [PATCH 09/17] ready for test

---
 src/mofdscribe/mof.py                      | 11 ++++
 tests/conftest.py                          |  6 ++
 tests/featurizers/bu/test_bu_featurizer.py | 10 ++-
 tests/test_files/Fe-MOF-74.cif             | 77 ++++++++++++++++++++++
 4 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_files/Fe-MOF-74.cif

diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py
index dd5b3956..450ec4c7 100644
--- a/src/mofdscribe/mof.py
+++ b/src/mofdscribe/mof.py
@@ -64,6 +64,17 @@ def __init__(
                 Defaults to ``"vesta"``.
             fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``.
                 Defaults to ``None``.
+                Some relevant kwargs are:
+                * check_dimensionality (bool): Check if the node is 0D.
+                    If not, split into isolated metals.
+                    Defaults to True.
+                * create_single_metal_bus (bool): Create a single metal BUs.
+                    Defaults to False.
+                * break_organic_nodes_at_metal (bool): Break nodes into single metal BU
+                    if they appear "too organic".
+
+                For the dimensionality featurizers applied to building blocks, you want
+                to be careful wtih ``check_dimensionality``.
         """
         self.__structure = (
             IStructure.from_sites(structure.sites)
diff --git a/tests/conftest.py b/tests/conftest.py
index 87cf7b86..433b6286 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,12 @@ def hkust_structure():
     return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif"))
 
 
+@pytest.fixture(scope="session")
+def mof74_structure():
+    """Return a pymatgen Structure for MOF74"""
+    return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "Fe-MOF-74.cif"))
+
+
 @pytest.fixture(scope="session")
 def hkust_path():
     """Return a path to a HKUST structure"""
diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py
index 14031012..510b435a 100644
--- a/tests/featurizers/bu/test_bu_featurizer.py
+++ b/tests/featurizers/bu/test_bu_featurizer.py
@@ -20,7 +20,7 @@
 from mofdscribe.mof import MOF
 
 
-def test_bu_featurizer(hkust_structure, molecule):
+def test_bu_featurizer(hkust_structure, molecule, mof74_structure):
     """Test that we can call BU featurizers with pymatgen molecules."""
     featurizer = BUFeaturizer(PHStats(no_supercell=True))
 
@@ -43,6 +43,14 @@ def test_bu_featurizer(hkust_structure, molecule):
     assert features[0] == 0
     assert features[1] == 0
 
+    # make sure we find the rod node
+    featurizer = BUFeaturizer(Dimensionality(), aggregations=("mean",))
+    mof4 = MOF(mof74_structure, fragmentation_kwargs={"check_dimensionality": False})
+    features = featurizer.featurize(mof=mof4)
+    assert features.shape == (2,)
+    assert features[0] == 1
+    assert features[1] == 0
+
 
 def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure):
     """Test that we can call BU featurizers with matminer molecules."""
diff --git a/tests/test_files/Fe-MOF-74.cif b/tests/test_files/Fe-MOF-74.cif
new file mode 100644
index 00000000..2a6fd4bf
--- /dev/null
+++ b/tests/test_files/Fe-MOF-74.cif
@@ -0,0 +1,77 @@
+data_crystal
+ 
+_cell_length_a    6.96775
+_cell_length_b    15.40000
+_cell_length_c    15.42110
+_cell_angle_alpha 117.56400
+_cell_angle_beta  98.67390
+_cell_angle_gamma 98.70470
+ 
+_symmetry_space_group_name_Hall 'P 1'
+_symmetry_space_group_name_H-M  'P 1'
+ 
+loop_
+_symmetry_equiv_pos_as_xyz
+ 'x,y,z' 
+ 
+loop_
+_atom_site_label
+_atom_site_type_symbol
+_atom_site_fract_x
+_atom_site_fract_y
+_atom_site_fract_z
+_atom_site_charge
+C          C       0.03198   0.90054   0.46577   0.1870000000
+C          C       0.08103   0.46326   0.91497  -0.1740000000
+C          C       0.13137   0.56584   0.09913   0.1870000000
+C          C       0.16623   0.08519   0.54897  -0.1750000000
+C          C       0.17945   0.43074   0.82994   0.3860000000
+C          C       0.19214   0.98722   0.51494  -0.0710000000
+C          C       0.20483   0.52779   0.01259  -0.0730000000
+C          C       0.25129   0.60109   0.43035   0.3860000000
+C          C       0.32290   0.51504   0.52768  -0.0720000000
+C          C       0.34978   0.17032   0.60199   0.3860000000
+C          C       0.38223   0.54845   0.46308  -0.1750000000
+C          C       0.43442   0.46640   0.56592   0.1880000000
+C          C       0.56558   0.53360   0.43408   0.1880000000
+C          C       0.61777   0.45155   0.53692  -0.1750000000
+C          C       0.65022   0.82968   0.39801   0.3860000000
+C          C       0.67710   0.48496   0.47232  -0.0720000000
+C          C       0.74871   0.39891   0.56965   0.3860000000
+C          C       0.79517   0.47221   0.98741  -0.0730000000
+C          C       0.80786   0.01278   0.48506  -0.0710000000
+C          C       0.82055   0.56926   0.17006   0.3860000000
+C          C       0.83377   0.91481   0.45103  -0.1750000000
+C          C       0.86863   0.43416   0.90087   0.1870000000
+C          C       0.91897   0.53674   0.08503  -0.1730000000
+C          C       0.96802   0.09946   0.53423   0.1880000000
+H          H       0.18345   0.52756   0.54920   0.0590000000
+H          H       0.34407   0.97808   0.52739   0.0590000000
+H          H       0.36584   0.54940   0.02155   0.0590000000
+H          H       0.63416   0.45060   0.97845   0.0590000000
+H          H       0.65593   0.02192   0.47261   0.0590000000
+H          H       0.81655   0.47244   0.45080   0.0590000000
+O          O       0.06914   0.36860   0.73969  -0.5690000000
+O          O       0.07325   0.80916   0.43511  -0.5130000000
+O          O       0.09643   0.61781   0.46347  -0.4540000000
+O          O       0.26402   0.62706   0.19021  -0.5130000000
+O          O       0.29956   0.62958   0.36820  -0.5690000000
+O          O       0.32998   0.26088   0.63057  -0.5690000000
+O          O       0.36301   0.43627   0.62724  -0.5130000000
+O          O       0.36745   0.46401   0.84623  -0.4540000000
+O          O       0.47868   0.84617   0.38113  -0.4540000000
+O          O       0.52132   0.15383   0.61887  -0.4540000000
+O          O       0.63255   0.53599   0.15377  -0.4540000000
+O          O       0.63699   0.56373   0.37276  -0.5130000000
+O          O       0.67002   0.73912   0.36943  -0.5690000000
+O          O       0.70044   0.37042   0.63180  -0.5690000000
+O          O       0.73598   0.37294   0.80979  -0.5130000000
+O          O       0.90357   0.38219   0.53653  -0.4540000000
+O          O       0.92675   0.19084   0.56489  -0.5130000000
+O          O       0.93086   0.63140   0.26031  -0.5690000000
+Fe         Fe      0.10807   0.32430   0.59200   1.1500000000
+Fe         Fe      0.21621   0.73303   0.32332   1.1500000000
+Fe         Fe      0.48320   0.59129   0.26667   1.1500000000
+Fe         Fe      0.51680   0.40871   0.73333   1.1500000000
+Fe         Fe      0.78379   0.26697   0.67668   1.1500000000
+Fe         Fe      0.89193   0.67570   0.40800   1.1500000000

From 230a694be22a555d1cdb8573ad78712adf403d70 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Sat, 24 Dec 2022 10:59:20 +0100
Subject: [PATCH 10/17] pass fragmentation kwargs

---
 src/mofdscribe/mof.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py
index 450ec4c7..f0cc03b5 100644
--- a/src/mofdscribe/mof.py
+++ b/src/mofdscribe/mof.py
@@ -102,13 +102,14 @@ def structure_graph(self) -> StructureGraph:
         return self.__structure_graph
 
     @classmethod
-    def from_file(cls, path: PathType, primitive: bool = True) -> "MOF":
+    def from_file(cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict]=None) -> "MOF":
         """Create a MOF class from a file.
 
         Args:
             path (PathType): The path to the file.
             primitive (bool): Whether to use the primitive cell or not.
                 Defaults to ``True``.
+            fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``.
 
         Returns:
             MOF: The MOF class.
@@ -116,7 +117,8 @@ def from_file(cls, path: PathType, primitive: bool = True) -> "MOF":
         return cls(
             IStructure.from_file(path).get_primitive_structure()
             if primitive
-            else IStructure.from_file(path)
+            else IStructure.from_file(path),
+            fragmentation_kwargs=fragmentation_kwargs,
         )
 
     @cached_property

From c063c2d45019236b3fcb7d0eb00968d09168cbd4 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Sat, 24 Dec 2022 11:20:01 +0100
Subject: [PATCH 11/17] docs: add graph-based feats

[skip ci]
---
 docs/source/background.rst                     | 11 +++++++++++
 .../featurizers/graph/dimensionality.rst       | 18 ++++++++++++++++++
 .../host_guest/host_guest_aprdf.rst            |  2 ++
 docs/source/references.rst                     |  2 ++
 4 files changed, 33 insertions(+)
 create mode 100644 docs/source/featurizers/graph/dimensionality.rst

diff --git a/docs/source/background.rst b/docs/source/background.rst
index d47926d7..e19d032a 100644
--- a/docs/source/background.rst
+++ b/docs/source/background.rst
@@ -60,6 +60,17 @@ descriptor that only considers the local environment (of e.g., 3 atoms). For thi
     featurizers/global/*
 
 
+Graph featurizers
+--------------------
+
+Graph featurizers are a special class of featurizers that are based on the structure graph. The structure graph is a periodic graph in which the edges are bonds and the nodes are atoms.
+
+.. toctree::
+    :glob:
+    :maxdepth: 1
+
+    featurizers/graph/*
+
 
 BU-centered featurizers
 -----------------------------
diff --git a/docs/source/featurizers/graph/dimensionality.rst b/docs/source/featurizers/graph/dimensionality.rst
new file mode 100644
index 00000000..ad8330af
--- /dev/null
+++ b/docs/source/featurizers/graph/dimensionality.rst
@@ -0,0 +1,18 @@
+Dimensionality 
+..................
+
+Returns the dimensionality of the structure. This measure is based on [LarsenDimensionality]_, where the structure graph is analyzed. 
+
+In the case of MOFs, rod like structures are considered 1D, sheet like structures are considered 2D, and 3D structures are considered 3D.
+
+This can be interesting for the metal nodes, where the typical SBUs such as Cu-paddlewheels are 0D. However, many well-known MOFs such as MOF-74 have infinite rod nodes, that this featurizer would consider 1D.
+
+.. featurizer::  Dimensionality
+    :id: Dimensionality
+    :considers_geometry: False
+    :considers_structure_graph: True
+    :encodes_chemistry: false
+    :scope: global
+    :scalar: True
+
+    Returns the dimensionality of the structure. This measure is based on [LarsenDimensionality]_, where the structure graph is analyzed.
\ No newline at end of file
diff --git a/docs/source/featurizers/host_guest/host_guest_aprdf.rst b/docs/source/featurizers/host_guest/host_guest_aprdf.rst
index 2fba836e..89867792 100644
--- a/docs/source/featurizers/host_guest/host_guest_aprdf.rst
+++ b/docs/source/featurizers/host_guest/host_guest_aprdf.rst
@@ -5,6 +5,8 @@ This featurizer builds on the :ref:`APRDF` featurizer, but instead of using the
 correlations between all atoms, it only considers the ones between the guest and all host atoms 
 (within some cutoff distance). 
 
+The APRDF is defined as:
+
 .. math::
 
   \operatorname{RDF}^{P}(R)=f \sum_{i, j}^{\text {all atom pairs }} P_{i} P_{j} \mathrm{e}^{-B\left(r_{i j}-R\right)^{2}}
diff --git a/docs/source/references.rst b/docs/source/references.rst
index 2c1201fe..3bf3bd47 100644
--- a/docs/source/references.rst
+++ b/docs/source/references.rst
@@ -112,3 +112,5 @@ References
 .. [Trappe] `Potoff, J. J.; Siepmann, J. I. Vapor–Liquid Equilibria of Mixtures Containing Alkanes, Carbon Dioxide, and Nitrogen. AIChE Journal 2001, 47 (7), 1676–1682. <https://doi.org/10.1002/aic.690470719>`_
 
 .. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. <https://doi.org/10.1016/j.neuroimage.2017.06.061>`_
+
+.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. <https://doi.org/10.1103/PhysRevMaterials.3.034003>`_
\ No newline at end of file

From 13b127706a8635b34bc2e09722b55264fd7e19f4 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Thu, 29 Dec 2022 13:46:46 +0100
Subject: [PATCH 12/17] add more BU featurizers

Fixes #419
---
 .../source/featurizers/bu_centered/angles.rst |  24 ++++
 .../featurizers/bu_centered/num_hops.rst      |  25 ++++
 docs/source/featurizers/global/size.rst       |   4 +
 docs/source/getting_started.rst               |  16 +++
 src/mofdscribe/featurizers/bu/__init__.py     |  11 +-
 .../featurizers/bu/angle_hist_featurizer.py   |  82 +++++++++++
 .../featurizers/bu/angle_stats_featurizer.py  |  57 ++++++++
 .../featurizers/bu/bu_featurizer.py           | 128 +++++++++++++++++-
 .../bu/distance_hist_featurizer.py            |  13 +-
 .../featurizers/chemistry/__init__.py         |   1 +
 .../featurizers/chemistry/energygrid.py       |  12 +-
 .../featurizers/chemistry/numatoms.py         |  46 +++++++
 .../chemistry/partialchargehistogram.py       |  13 +-
 src/mofdscribe/featurizers/graph/numhops.py   |  66 +++++++++
 src/mofdscribe/featurizers/utils/mixins.py    |  10 ++
 src/mofdscribe/mof.py                         |   4 +-
 .../bu/test_angle_hist_featurizer.py          |  11 ++
 .../bu/test_angle_stats_featurizer.py         |  11 ++
 tests/featurizers/bu/test_bu_featurizer.py    |  59 ++++++++
 tests/featurizers/chemistry/test_num_atoms.py |   8 ++
 tests/featurizers/graph/test_num_hop.py       |  17 +++
 21 files changed, 597 insertions(+), 21 deletions(-)
 create mode 100644 docs/source/featurizers/bu_centered/angles.rst
 create mode 100644 docs/source/featurizers/bu_centered/num_hops.rst
 create mode 100644 docs/source/featurizers/global/size.rst
 create mode 100644 src/mofdscribe/featurizers/bu/angle_hist_featurizer.py
 create mode 100644 src/mofdscribe/featurizers/bu/angle_stats_featurizer.py
 create mode 100644 src/mofdscribe/featurizers/chemistry/numatoms.py
 create mode 100644 src/mofdscribe/featurizers/graph/numhops.py
 create mode 100644 src/mofdscribe/featurizers/utils/mixins.py
 create mode 100644 tests/featurizers/bu/test_angle_hist_featurizer.py
 create mode 100644 tests/featurizers/bu/test_angle_stats_featurizer.py
 create mode 100644 tests/featurizers/chemistry/test_num_atoms.py
 create mode 100644 tests/featurizers/graph/test_num_hop.py

diff --git a/docs/source/featurizers/bu_centered/angles.rst b/docs/source/featurizers/bu_centered/angles.rst
new file mode 100644
index 00000000..71d23d87
--- /dev/null
+++ b/docs/source/featurizers/bu_centered/angles.rst
@@ -0,0 +1,24 @@
+Angle-based description of BU shape 
+=======================================
+
+The following featurizers compute the angles between all pairs of atoms in a building block. 
+We always compute the angles A-COM-B, where COM is the center of mass of the building block.
+
+Given the distribution of the angles, we can compute fixed-length descriptors by either converting
+the distribution to a histogram or computing some statistics (mean, standard deviation, etc.) of the distribution.
+
+.. featurizer::  PairWiseAngleHist
+    :id: PairWiseAngleHist
+    :considers_geometry: True
+    :considers_structure_graph: False
+    :encodes_chemistry: False
+    :scope: bu
+    :scalar: False
+
+.. featurizer::  PairWiseAngleStats
+    :id: PairWiseAngleStats
+    :considers_geometry: True
+    :considers_structure_graph: False
+    :encodes_chemistry: False
+    :scope: bu
+    :scalar: False
diff --git a/docs/source/featurizers/bu_centered/num_hops.rst b/docs/source/featurizers/bu_centered/num_hops.rst
new file mode 100644
index 00000000..597afb72
--- /dev/null
+++ b/docs/source/featurizers/bu_centered/num_hops.rst
@@ -0,0 +1,25 @@
+Shortest-Path Based Description of Building Blocks
+======================================================
+
+For certain targets, the proximity of connecting groups in a building unit 
+(e.g. carboxy groups) can be interesting features. 
+
+One way to describe this generally is to compute the distribution 
+of shortest paths between special sites in the building units that
+our ``moffragmentor`` package calls "binding sites" and "branching sites". 
+
+.. featurizer::  BranchingNumHopFeaturizer
+    :id: BranchingNumHopFeaturizer
+    :considers_geometry: False
+    :considers_structure_graph: True
+    :encodes_chemistry: False
+    :scope: bu
+    :scalar: False
+
+.. featurizer::  BindingNumHopFeaturizer
+    :id: BindingNumHopFeaturizer
+    :considers_geometry: False
+    :considers_structure_graph: True
+    :encodes_chemistry: False
+    :scope: bu
+    :scalar: False
diff --git a/docs/source/featurizers/global/size.rst b/docs/source/featurizers/global/size.rst
new file mode 100644
index 00000000..1625e9cf
--- /dev/null
+++ b/docs/source/featurizers/global/size.rst
@@ -0,0 +1,4 @@
+Extensive size descriptors
+================================
+
+
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 755c594d..fb45d341 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -62,6 +62,22 @@ directory.
     and notebook in the `examples folder <https://github.com/kjappelbaum/mofdscribe/tree/main/examples>`_.
 
 
+.. admonition:: Saving time using the MOF object 
+    :class: tip 
+
+    From our experience, the most time-consuming part of featurization is the
+    the computation of the structure graph or the fragments. 
+
+    Additionally, you often do not know in advance which featurizers you will
+    use.
+
+    If you want to save in the case you need to compute additional features, 
+    it can be practical to serialize the :py:class:`~mofdscribe.mof.MOF` objects
+    after the first featurization. 
+    The objects will already contain the structure graph and the fragments (if 
+    they have been computed in the first featurization).
+    
+
 Using a reference dataset
 --------------------------
 
diff --git a/src/mofdscribe/featurizers/bu/__init__.py b/src/mofdscribe/featurizers/bu/__init__.py
index b1658dfd..1add3acf 100644
--- a/src/mofdscribe/featurizers/bu/__init__.py
+++ b/src/mofdscribe/featurizers/bu/__init__.py
@@ -1,6 +1,15 @@
 # -*- coding: utf-8 -*-
 """Featurizers operating on the secondary building units."""
-from .bu_featurizer import BUFeaturizer, MOFBBs  # noqa: F401
+from .angle_hist_featurizer import PairWiseAngleHist  # noqa: F401
+from .angle_stats_featurizer import PairWiseAngleStats  # noqa: F401
+from .bu_featurizer import (  # noqa: F401
+    BindingNumHopFeaturizer,
+    BindingSitesFeaturizer,
+    BranchingNumHopFeaturizer,
+    BranchingSitesFeaturizer,
+    BUFeaturizer,
+    MOFBBs,
+)
 from .bu_matches import BUMatch  # noqa: F401
 from .compositionstats_featurizer import CompositionStats  # noqa: F401
 from .distance_hist_featurizer import PairwiseDistanceHist  # noqa: F401
diff --git a/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py b/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py
new file mode 100644
index 00000000..e51ea5c4
--- /dev/null
+++ b/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+"""Describe molecules by computing histogram of angles A-COM-B between atoms A, B and COM.
+
+COM is the center of mass of the molecule.
+"""
+from typing import List, Union
+
+import numpy as np
+from matminer.featurizers.base import BaseFeaturizer
+from pymatgen.core import IMolecule, Molecule
+from pymatgen.util.coord import get_angle
+
+from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule
+from mofdscribe.featurizers.utils.histogram import get_rdf
+from mofdscribe.featurizers.utils.mixins import GetGridMixin
+
+
+@operates_on_molecule
+@operates_on_imolecule
+class PairWiseAngleHist(BaseFeaturizer, GetGridMixin):
+    _NAME = "PairWiseAngleHist"
+
+    def __init__(
+        self,
+        lower_bound: float = 0.0,
+        upper_bound: float = 180.0,
+        bin_size: float = 20,
+        density: bool = True,
+    ) -> None:
+        """Create a new PairwiseDistanceHist featurizer.
+
+        Args:
+            lower_bound (float): Lower bound of the histogram.
+                Defaults to 0.0.
+            upper_bound (float): Upper bound of the histogram.
+                Defaults to 180.
+            bin_size (float): Size of the bins.
+                Defaults to 20.
+            density (bool): Whether to return the density or the counts.
+                Defaults to True.
+        """
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        self.bin_size = bin_size
+        self.density = density
+
+    def feature_labels(self) -> List[str]:
+        return [
+            f"{self._NAME}_{a}"
+            for a in self._get_grid(self.lower_bound, self.upper_bound, self.bin_size)
+        ]
+
+    def _featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray:
+        # Get the pairwise distances
+        com = molecule.center_of_mass
+        angles = []
+        for i in range(len(molecule)):
+            for j in range(len(molecule)):
+                if i > j:
+                    v1 = molecule.cart_coords[i] - com
+                    v2 = molecule.cart_coords[j] - com
+                    angles.append(get_angle(v1, v2))
+
+        # Compute the histogram
+        features = get_rdf(
+            angles,
+            lower_lim=self.lower_bound,
+            upper_lim=self.upper_bound,
+            bin_size=self.bin_size,
+            density=self.density,
+            normalized=False,
+        )
+        return features
+
+    def featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray:
+        return self._featurize(molecule)
+
+    def citations(self) -> List[str]:
+        return []
+
+    def implementors(self) -> List[str]:
+        return ["Kevin Maik Jablonka"]
diff --git a/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py b/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py
new file mode 100644
index 00000000..8469b012
--- /dev/null
+++ b/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+"""Describe molecules by computing statistics for the distribution of angles A-COM-B between atoms A, B and COM.
+
+COM is the center of mass of the molecule.
+"""
+
+from typing import List, Tuple, Union
+
+import numpy as np
+from matminer.featurizers.base import BaseFeaturizer
+from pymatgen.core import IMolecule, Molecule
+from pymatgen.util.coord import get_angle
+
+from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
+from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule
+
+
+@operates_on_molecule
+@operates_on_imolecule
+class PairWiseAngleStats(BaseFeaturizer):
+    _NAME = "PairWiseAngleStats"
+
+    def __init__(self, aggreations: Tuple[str] = ("mean", "std", "min", "max")) -> None:
+        """Create a new PairwiseDistanceStats featurizer.
+
+        Args:
+            aggreations (Tuple[str]): Aggreations to compute.
+                Defaults to ("mean", "std", "min", "max").
+        """
+        self.aggreations = aggreations
+
+    def feature_labels(self) -> List[str]:
+        return [f"{self._NAME}_{a}" for a in self.aggreations]
+
+    def _featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray:
+        # Get the pairwise distances
+        com = molecule.center_of_mass
+        angles = []
+        for i in range(len(molecule)):
+            for j in range(len(molecule)):
+                if i > j:
+                    v1 = molecule.cart_coords[i] - com
+                    v2 = molecule.cart_coords[j] - com
+                    angles.append(get_angle(v1, v2))
+
+        feats = [ARRAY_AGGREGATORS[a](angles) for a in self.aggreations]
+
+        return np.array(feats)
+
+    def featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray:
+        return self._featurize(molecule)
+
+    def citations(self) -> List[str]:
+        return []
+
+    def implementors(self) -> List[str]:
+        return ["Kevin Maik Jablonka"]
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index 38e2ee7a..d67677b4 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Compute features on the BUs and then aggregate them."""
 from abc import abstractmethod
-from typing import Collection, List, Optional, Tuple, Union
+from typing import Callable, Collection, List, Optional, Tuple, Union
 
 import numpy as np
 from matminer.featurizers.base import MultipleFeaturizer
@@ -11,13 +11,20 @@
 
 from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer
 from mofdscribe.featurizers.bu.utils import boxed_molecule
+from mofdscribe.featurizers.graph.numhops import NumHops
 from mofdscribe.featurizers.utils import nan_array, set_operates_on
 from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
 from mofdscribe.mof import MOF
 
 if False:
-    from moffragmentor import SBU
-__all__ = ("BUFeaturizer", "BranchingSitesFeaturizer", "BindingSitesFeaturizer")
+    from moffragmentor import SBU  # for type hints
+__all__ = (
+    "BUFeaturizer",
+    "BranchingSitesFeaturizer",
+    "BindingSitesFeaturizer",
+    "BranchingNumHopFeaturizer",
+    "BindingNumHopFeaturizer",
+)
 
 
 # since fragmentation is not super straightforward,
@@ -353,6 +360,7 @@ def _extract(self, mof: MOF):
                 _structuregraph_from_indices(mof, node._original_binding_indices)
                 for node in mof.fragments.nodes
             ]
+            return nodes, linkers
         elif Molecule in self._operates_on:
             raise NotImplementedError("BindingSitesFeaturizer does not support Molecule yet.")
         else:
@@ -397,9 +405,123 @@ def _extract(self, mof: MOF):
                 _structuregraph_from_indices(mof, node._original_binding_indices)
                 for node in mof.fragments.nodes
             ]
+            return nodes, linkers
         elif Molecule in self._operates_on:
             raise NotImplementedError("BranchingSitesFeaturizer does not support Molecule yet.")
         else:
             raise NotImplementedError(
                 f"BranchingSitesFeaturizer does not support featurizers operating on {self._operates_on}."
             )
+
+
+def _extract_branching_indices(bu):
+    return bu.graph_branching_indices
+
+
+def _extract_binding_indices(bu):
+    return bu.binding_indices
+
+
+class _NumSiteHops(BUFeaturizer):
+    _NAME = "NumBranchingSiteHops"
+
+    def __init__(self, hop_stat_aggregations, aggregations, index_extractor: Callable):
+        self.hop_stat_aggregations = hop_stat_aggregations
+        self.aggregations = aggregations
+        self.index_extractor = index_extractor
+        self._featurizer = NumHops(self.hop_stat_aggregations)
+
+    def _extract(self, mof: MOF):
+
+        linkers = [linker.molecule_graph for linker in mof.fragments.linkers]
+        nodes = [node.molecule_graph for node in mof.fragments.nodes]
+
+        linker_indices = [self.index_extractor(linker) for linker in mof.fragments.linkers]
+        node_indices = [self.index_extractor(node) for node in mof.fragments.nodes]
+        return nodes, linkers, node_indices, linker_indices
+
+    def featurize(self, mof: Optional[MOF] = None) -> np.ndarray:
+        nodes, linkers, node_indices, linker_indices = self._extract(mof)
+        node_feats = np.array(
+            [
+                self._featurizer._featurize(node, node_idx)
+                for node, node_idx in zip(nodes, node_indices)
+            ]
+        )
+        linker_feats = np.array(
+            [
+                self._featurizer._featurize(linker, linker_idx)
+                for linker, linker_idx in zip(linkers, linker_indices)
+            ]
+        )
+
+        node_aggregated = np.concatenate(
+            [ARRAY_AGGREGATORS[agg](node_feats, axis=0) for agg in self.aggregations]
+        )
+        linker_aggregated = np.concatenate(
+            [ARRAY_AGGREGATORS[agg](linker_feats, axis=0) for agg in self.aggregations]
+        )
+        return np.concatenate([node_aggregated, linker_aggregated])
+
+    def feature_labels(self) -> List[str]:
+        base_feature_labels = self._featurizer.feature_labels()
+        feature_labels = []
+        for bb in ["node", "linker"]:
+            for agg in self.aggregations:
+                for feat in base_feature_labels:
+                    feature_labels.append(f"{self._NAME}_{bb}_{agg}_{feat}")
+        return feature_labels
+
+    def implementors(self) -> List[str]:
+        return ["Kevin Maik Jablonka"]
+
+    def citations(self) -> List[str]:
+        return []
+
+
+class BranchingNumHopFeaturizer(_NumSiteHops):
+    """Compute statistics on the shortest path lengths between branching sites."""
+
+    _NAME = "BranchingNumHopFeaturizer"
+
+    def __init__(
+        self,
+        hop_stat_aggregations: Tuple[str] = ("mean", "std", "min", "max"),
+        aggregations: Tuple[str] = ("mean", "std", "min", "max"),
+    ):
+        """Construct a BranchingNumHopFeaturizer.
+
+        Args:
+            hop_stat_aggregations (Tuple[str]): Aggregation functions to apply to the
+                shortest path lengths between branching sites on a building blocks.
+                Defaults to ("mean", "std", "min", "max").
+            aggregations (Tuple[str]): Aggregation functions to apply to the
+                aggregated statistcs of the shortest path lengths between branching sites
+                of different building blocks of the same .
+                Defaults to ("mean", "std", "min", "max").
+        """
+        super().__init__(hop_stat_aggregations, aggregations, _extract_branching_indices)
+
+
+class BindingNumHopFeaturizer(_NumSiteHops):
+    """Compute statistics on the shortest path lengths between binding sites."""
+
+    _NAME = "BindingNumHopFeaturizer"
+
+    def __init__(
+        self,
+        hop_stat_aggregations: Tuple[str] = ("mean", "std", "min", "max"),
+        aggregations: Tuple[str] = ("mean", "std", "min", "max"),
+    ):
+        """Construct a BindingNumHopFeaturizer.
+
+        Args:
+            hop_stat_aggregations (Tuple[str]): Aggregation functions to apply to the
+                shortest path lengths between branching sites on a building blocks.
+                Defaults to ("mean", "std", "min", "max").
+            aggregations (Tuple[str]): Aggregation functions to apply to the
+                aggregated statistcs of the shortest path lengths between branching sites
+                of different building blocks of the same .
+                Defaults to ("mean", "std", "min", "max").
+        """
+        super().__init__(hop_stat_aggregations, aggregations, _extract_binding_indices)
diff --git a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py
index dc5fea1f..ef443fe4 100644
--- a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py
@@ -13,13 +13,14 @@
     operates_on_structure,
 )
 from mofdscribe.featurizers.utils.histogram import get_rdf
+from mofdscribe.featurizers.utils.mixins import GetGridMixin
 
 
 @operates_on_molecule
 @operates_on_imolecule
 @operates_on_istructure
 @operates_on_structure
-class PairwiseDistanceHist(BaseFeaturizer):
+class PairwiseDistanceHist(BaseFeaturizer, GetGridMixin):
     """
     Describe the shape of molecules by computing a histogram of pairwise distances.
 
@@ -32,6 +33,8 @@ class PairwiseDistanceHist(BaseFeaturizer):
     reported in [Zhang2022]_.
     """
 
+    _NAME = "PairwiseDistanceHist"
+
     def __init__(
         self,
         lower_bound: float = 0.0,
@@ -56,11 +59,11 @@ def __init__(
         self.bin_size = bin_size
         self.density = density
 
-    def _get_grid(self):
-        return np.arange(self.lower_bound, self.upper_bound, self.bin_size)
-
     def feature_labels(self) -> List[str]:
-        return [f"pairwise_distance_hist_{a}" for a in self._get_grid()]
+        return [
+            f"{self._NAME}_{a}"
+            for a in self._get_grid(self.lower_bound, self.upper_bound, self.bin_size)
+        ]
 
     def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray:
         return self._featurize(structure)
diff --git a/src/mofdscribe/featurizers/chemistry/__init__.py b/src/mofdscribe/featurizers/chemistry/__init__.py
index ade4df4c..f569d3e3 100644
--- a/src/mofdscribe/featurizers/chemistry/__init__.py
+++ b/src/mofdscribe/featurizers/chemistry/__init__.py
@@ -4,6 +4,7 @@
 from .aprdf import APRDF  # noqa: F401
 from .energygrid import EnergyGridHistogram  # noqa: F401
 from .henry import Henry  # noqa: F401
+from .numatoms import NumAtoms  # noqa: F401
 from .partialchargehistogram import PartialChargeHistogram  # noqa: F401
 from .partialchargestats import PartialChargeStats  # noqa: F401
 from .price import PriceLowerBound  # noqa: F401
diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py
index e8a8dd5b..1ccee4d1 100644
--- a/src/mofdscribe/featurizers/chemistry/energygrid.py
+++ b/src/mofdscribe/featurizers/chemistry/energygrid.py
@@ -11,6 +11,7 @@
 from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.histogram import get_rdf
+from mofdscribe.featurizers.utils.mixins import GetGridMixin
 from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell
 from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa
 from mofdscribe.mof import MOF
@@ -64,7 +65,7 @@ def read_ascii_grid(filename: Union[str, os.PathLike]) -> pd.DataFrame:
 
 @operates_on_istructure
 @operates_on_structure
-class EnergyGridHistogram(MOFBaseFeaturizer):
+class EnergyGridHistogram(MOFBaseFeaturizer, GetGridMixin):
     """Computes the energy grid histograms as originally proposed by Bucior et al. [Bucior2019]_.
 
     Conventionally, energy grids can be used to speed up molecular simulations.
@@ -88,6 +89,8 @@ class EnergyGridHistogram(MOFBaseFeaturizer):
             https://doi.org/10.1063/5.0050823.
     """
 
+    _NAME = "EnergyGridHistogram"
+
     def __init__(
         self,
         raspa_dir: Union[str, os.PathLike, None] = None,
@@ -180,15 +183,12 @@ def fit_transform(self, structures: List[Union[Structure, IStructure]]):
     def fit(self, structure: Union[Structure, IStructure]):
         ...
 
-    def _get_grid(self):
-        return np.arange(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw)
-
     def feature_labels(self) -> List[str]:
-        grid = self._get_grid()
+        grid = self._get_grid(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw)
         labels = []
         for site in self.sites:
             for grid_point in grid:
-                labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}")
+                labels.append(f"{self._NAME}_{self.mol_name}_{site}_{grid_point}")
         return labels
 
     def featurize(self, mof: MOF) -> np.ndarray:
diff --git a/src/mofdscribe/featurizers/chemistry/numatoms.py b/src/mofdscribe/featurizers/chemistry/numatoms.py
new file mode 100644
index 00000000..fd4632fc
--- /dev/null
+++ b/src/mofdscribe/featurizers/chemistry/numatoms.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+"""Atom count featurizer."""
+from typing import List, Union
+
+import numpy as np
+from pymatgen.core import IMolecule, IStructure, Molecule, Structure
+
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.utils.extend import (
+    operates_on_imolecule,
+    operates_on_istructure,
+    operates_on_molecule,
+    operates_on_structure,
+)
+from mofdscribe.mof import MOF
+
+
+@operates_on_structure
+@operates_on_istructure
+@operates_on_molecule
+@operates_on_imolecule
+class NumAtoms(MOFBaseFeaturizer):
+    """Featurizer that returns the number of atoms in a structure."""
+
+    def __init__(self) -> None:
+        """Construct a new NumAtoms featurizer."""
+        super().__init__()
+
+    def _featurize(
+        self, structure_object: Union[Structure, IStructure, Molecule, IMolecule]
+    ) -> np.ndarray:
+        if not isinstance(structure_object, (Structure, IStructure, Molecule, IMolecule)):
+            raise ValueError("Structure object must be a pymatgen Structure or Molecule.")
+        return np.array([len(structure_object)])
+
+    def featurize(self, mof: MOF) -> int:
+        return len(mof.structure)
+
+    def feature_labels(self) -> List[str]:
+        return ["num_atoms"]
+
+    def citations(self) -> List[str]:
+        return []
+
+    def implementors(self) -> List[str]:
+        return ["Kevin Maik Jablonka"]
diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
index 963e7f78..7c5e43bd 100644
--- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
+++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py
@@ -9,6 +9,7 @@
 from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges
 from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
 from mofdscribe.featurizers.utils.histogram import get_rdf
+from mofdscribe.featurizers.utils.mixins import GetGridMixin
 from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
@@ -17,13 +18,15 @@
 
 @operates_on_istructure
 @operates_on_structure
-class PartialChargeHistogram(MOFBaseFeaturizer):
+class PartialChargeHistogram(MOFBaseFeaturizer, GetGridMixin):
     """Compute partial charges using the EqEq charge equilibration method [Ongari2019]_.
 
     Then derive a fix-length feature vector from the partial charges by binning
     charges in a histogram.
     """
 
+    _NAME = "PartialChargeHistogram"
+
     def __init__(
         self,
         min_charge: float = -4,
@@ -44,11 +47,11 @@ def __init__(
         self.max_charge = max_charge
         self.bin_size = bin_size
 
-    def _get_grid(self):
-        return np.arange(self.min_charge, self.max_charge, self.bin_size)
-
     def feature_labels(self) -> List[str]:
-        return [f"chargehist_{val}" for val in self._get_grid()]
+        return [
+            f"{self._NAME}_{val}"
+            for val in self._get_grid(self.min_charge, self.max_charge, self.bin_size)
+        ]
 
     def featurize(self, mof: MOF) -> np.ndarray:
         return self._featurize(s=mof.structure)
diff --git a/src/mofdscribe/featurizers/graph/numhops.py b/src/mofdscribe/featurizers/graph/numhops.py
new file mode 100644
index 00000000..7540aec3
--- /dev/null
+++ b/src/mofdscribe/featurizers/graph/numhops.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""Featurizer that computes statistics on the number of hops between binding/branching sites."""
+
+from typing import Collection, Tuple, Union
+
+import networkx as nx
+import numpy as np
+from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph
+
+from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer
+from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
+from mofdscribe.featurizers.utils.extend import (
+    operates_on_moleculegraph,
+    operates_on_structuregraph,
+)
+from mofdscribe.mof import MOF
+
+
+def _hop_stats(
+    structuregraph: Union[StructureGraph, MoleculeGraph], indices: Collection[int], aggregations
+) -> np.ndarray:
+    # for all combinations of indices compute the shortest path
+    # and return the statistics of the shortest paths
+    shortest_paths = []
+    undirected = structuregraph.graph.to_undirected()
+    for i_count, i in enumerate(indices):
+        for j_count, j in enumerate(indices):
+            if i_count > j_count:
+                shortest_paths.append(nx.shortest_path_length(undirected, i, j))
+    return np.array([ARRAY_AGGREGATORS[agg](shortest_paths) for agg in aggregations])
+
+
+@operates_on_structuregraph
+@operates_on_moleculegraph
+class NumHops(GraphFeaturizer):
+    """Featurizer that computes statistics on the shortest path lengths between sites."""
+
+    _NAME = "num_hops"
+
+    def __init__(self, aggregations: Tuple[str] = ("mean", "std", "min", "max")) -> None:
+        """Construct a NumHops featurizer.
+
+        Args:
+            aggregations: The aggregations to compute on the shortest path lengths.
+                Defaults to ("mean", "std", "min", "max").
+        """
+        self.aggregations = aggregations
+
+    def _featurize(
+        self, structuregraph: Union[StructureGraph, MoleculeGraph], indices: Collection[int]
+    ) -> np.ndarray:
+        # for all combinations of indices compute the shortest path
+        # and return the statistics of the shortest paths
+        return _hop_stats(structuregraph, indices, self.aggregations)
+
+    def featurize(self, mof: MOF, indices: Collection[int]) -> np.ndarray:
+        return self._featurize(mof.structure_graph, indices)
+
+    def feature_labels(self) -> Collection[str]:
+        return [f"{self._NAME}_{agg}" for agg in self.aggregations]
+
+    def citations(self) -> Collection[str]:
+        return []
+
+    def implementors(self) -> Collection[str]:
+        return ["Kevin Maik Jablonka"]
diff --git a/src/mofdscribe/featurizers/utils/mixins.py b/src/mofdscribe/featurizers/utils/mixins.py
new file mode 100644
index 00000000..d995456a
--- /dev/null
+++ b/src/mofdscribe/featurizers/utils/mixins.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+"""Mixin classes for featurizers."""
+import numpy as np
+
+
+class GetGridMixin:
+    """Mixin class for getting a linearly spaced grid."""
+
+    def _get_grid(self, lower_bound, upper_bound, bin_size):
+        return np.arange(lower_bound, upper_bound, bin_size)
diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py
index f0cc03b5..4cb43e75 100644
--- a/src/mofdscribe/mof.py
+++ b/src/mofdscribe/mof.py
@@ -102,7 +102,9 @@ def structure_graph(self) -> StructureGraph:
         return self.__structure_graph
 
     @classmethod
-    def from_file(cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict]=None) -> "MOF":
+    def from_file(
+        cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict] = None
+    ) -> "MOF":
         """Create a MOF class from a file.
 
         Args:
diff --git a/tests/featurizers/bu/test_angle_hist_featurizer.py b/tests/featurizers/bu/test_angle_hist_featurizer.py
new file mode 100644
index 00000000..e58ab7ab
--- /dev/null
+++ b/tests/featurizers/bu/test_angle_hist_featurizer.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+from mofdscribe.featurizers.bu.angle_hist_featurizer import PairWiseAngleHist
+
+
+def test_angle_hist_featurizer(molecule):
+    featurizer = PairWiseAngleHist(density=True)
+    feats = featurizer.featurize(molecule)
+    labels = featurizer.feature_labels()
+    assert len(feats) == len(labels)
+    assert feats[0] == 0
diff --git a/tests/featurizers/bu/test_angle_stats_featurizer.py b/tests/featurizers/bu/test_angle_stats_featurizer.py
new file mode 100644
index 00000000..d4d2f9d1
--- /dev/null
+++ b/tests/featurizers/bu/test_angle_stats_featurizer.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+from mofdscribe.featurizers.bu.angle_stats_featurizer import PairWiseAngleStats
+
+
+def test_angle_stats_featurizer(molecule):
+    featurizer = PairWiseAngleStats()
+    feats = featurizer.featurize(molecule)
+    labels = featurizer.feature_labels()
+    assert len(feats) == len(labels)
+    assert (feats[0] > 90) & (feats[0] < 120)
+    assert feats[-1] == 180
diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py
index 510b435a..a49f8c87 100644
--- a/tests/featurizers/bu/test_bu_featurizer.py
+++ b/tests/featurizers/bu/test_bu_featurizer.py
@@ -8,7 +8,9 @@
 from pymatgen.core import Structure
 
 from mofdscribe.featurizers.bu.bu_featurizer import (
+    BindingNumHopFeaturizer,
     BindingSitesFeaturizer,
+    BranchingNumHopFeaturizer,
     BranchingSitesFeaturizer,
     BUFeaturizer,
     MOFBBs,
@@ -122,3 +124,60 @@ def test_branching_site_featurizer(hkust_structure):
     assert df["BranchingSitesFeaturizer_node_mean_lsop_sq_plan_max"].values[0] > 0.9
 
     assert df["BranchingSitesFeaturizer_linker_mean_lsop_tri_plan"].values[0] > 0.98
+
+
+def test_branching_num_hop(hkust_structure):
+    mof = MOF(hkust_structure)
+
+    featurizer = BranchingNumHopFeaturizer()
+    feats = featurizer.featurize(mof=mof)
+    labels = featurizer.feature_labels()
+    assert len(feats) == len(labels)
+    assert np.allclose(
+        feats,
+        [
+            4.0,
+            0.0,
+            4.0,
+            4.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            4.0,
+            0.0,
+            4.0,
+            4.0,
+            4.0,
+            0.0,
+            4.0,
+            4.0,
+            4.0,
+            0.0,
+            4.0,
+            4.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            4.0,
+            0.0,
+            4.0,
+            4.0,
+            4.0,
+            0.0,
+            4.0,
+            4.0,
+        ],
+    )
+
+
+def test_binding_num_hop(hkust_structure):
+    mof = MOF(hkust_structure)
+
+    featurizer = BindingNumHopFeaturizer()
+    feats = featurizer.featurize(mof=mof)
+    labels = featurizer.feature_labels()
+    assert len(feats) == len(labels)
+    assert feats[2] == 2.0
+    assert feats[3] == 4.0
diff --git a/tests/featurizers/chemistry/test_num_atoms.py b/tests/featurizers/chemistry/test_num_atoms.py
new file mode 100644
index 00000000..30f52c68
--- /dev/null
+++ b/tests/featurizers/chemistry/test_num_atoms.py
@@ -0,0 +1,8 @@
+from mofdscribe.featurizers.chemistry import NumAtoms
+from mofdscribe.mof import MOF
+
+
+def test_num_atoms(hkust_structure):
+    mof = MOF(hkust_structure)
+    num_atoms = NumAtoms()
+    assert num_atoms.featurize(mof) == 624
diff --git a/tests/featurizers/graph/test_num_hop.py b/tests/featurizers/graph/test_num_hop.py
new file mode 100644
index 00000000..a4bb6a44
--- /dev/null
+++ b/tests/featurizers/graph/test_num_hop.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+
+from mofdscribe.featurizers.graph.numhops import NumHops
+from mofdscribe.mof import MOF
+
+
+def test_num_hops(hkust_structure):
+    mof = MOF(hkust_structure)
+    num_hops = NumHops()
+    features = num_hops.featurize(mof, [0, 463])
+    assert len(features) == len(num_hops.feature_labels())
+    assert np.allclose(features, [1.0, 0, 1.0, 1.0])
+
+    features = num_hops.featurize(mof, [0, 272])
+    assert len(features) == len(num_hops.feature_labels())
+    assert np.allclose(features, [2.0, 0.0, 2.0, 2.0])

From 0990f9a100807d71f9a3cba101c08bd81cfaac4f Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Thu, 29 Dec 2022 18:25:22 +0100
Subject: [PATCH 13/17] wip: update racs

[ci skip]
---
 src/mofdscribe/featurizers/bu/racs.py         | 203 ++++++++++++++++++
 src/mofdscribe/featurizers/chemistry/racs.py  |  42 +++-
 .../featurizers/utils/structure_graph.py      |  80 ++++---
 tests/featurizers/chemistry/test_fragment.py  |   3 +
 tests/featurizers/chemistry/test_racs.py      |  15 +-
 tests/utils/test_structure_graph.py           |  14 +-
 6 files changed, 311 insertions(+), 46 deletions(-)
 create mode 100644 src/mofdscribe/featurizers/bu/racs.py

diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py
new file mode 100644
index 00000000..5f29e872
--- /dev/null
+++ b/src/mofdscribe/featurizers/bu/racs.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+from typing import Tuple, Union, List, Optional, Collection, Dict, Set
+import networkx.algorithms.community as nx_comm
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph
+from mofdscribe.utils.extend import operates_on_moleculegraph, operates_on_structuregraph
+from mofdscribe.mof import MOF
+from element_coder.data.coding_data import get_coding_dict
+from mofdscribe.utils.structuregraph import get_neighbors_up_to_scope
+from collections import defaultdict
+from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS
+from mofdscribe.featurizers.chemistry.racs import compute_racs
+from collections import OrderedDict
+
+def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]):
+    if isinstance(structuregraph, StructureGraph):
+        return structuregraph.structure.sites
+    elif isinstance(structuregraph, MoleculeGraph):
+        return structuregraph.molecule.sites
+    else:
+        raise ValueError("structuregraph must be either a StructureGraph or a MoleculeGraph")
+
+def _get_atom_site_indices(structuregraph: Union[StructureGraph, MoleculeGraph], atom_groups: Collection[Tuple[str, List[str], bool]]):
+    """Get the indices of the sites that belong to the atom groups."""
+    atom_grouped_indices = defaultdict(set)
+    for atom_group_name, elements, _no_terminal in atom_groups:
+        for site in _get_site_iter(structuregraph):
+            if site.specie.symbol in elements:
+                atom_grouped_indices[atom_group_name].add(site.index)
+    return atom_grouped_indices
+
+def _split_up_communities(
+    structuregraph: Union[StructureGraph, MoleculeGraph],
+    communities: List[List[int]],
+    atom_groups: Collection[Tuple[str, List[str], bool]],
+):
+    """Create dictionary of communities with atom groups as keys."""
+    atom_grouped_communities = defaultdict(set)
+    indices_to_atom_group = _get_atom_site_indices()
+
+    for atom_group_name, elements, _no_terminal in atom_groups:
+        for communities in communities:
+            atom_grouped_communities[atom_group_name].update(indices_to_atom_group & set(communities))
+
+    return atom_grouped_communities
+    
+    
+
+
+_ALL_ELEMENTS = set(get_coding_dict("atomic").keys())
+_ALL_ELEMENTS_EXCEPT_C_H = _ALL_ELEMENTS - {"C", "H"}
+
+
+# ToDo: generalize the code and implement in only one place 
+# the main racs code is quite similar to this one
+def _get_racs_for_community(
+    community_indices,
+    structure_graph: StructureGraph,
+    neighbors_at_distance: Dict[int, Dict[int, Set[int]]],
+    properties: Tuple[str],
+    scopes: List[int],
+    property_aggregations: Tuple[str],
+    corr_aggregations: Tuple[str],
+    community_aggregations: Tuple[str],
+    community_name: str = "",
+):
+    community_racs = defaultdict(list)
+
+    if not community_indices:
+        # one nested list to make it trigger filling it with nan values
+        community_indices = [[]]
+    for start_indices in community_indices:
+        for scope in scopes:
+            racs = compute_racs(
+                start_indices,
+                structure_graph,
+                neighbors_at_distance,
+                properties,
+                scope,
+                property_aggregations,
+                corr_aggregations,
+                community_name,
+            )
+            for k, v in racs.items():
+                community_racs[k].append(v)
+
+    aggregated_racs = {}
+    for racs_name, racs_values in community_racs.items():
+        for community_agg in community_aggregations:
+            agg_func = ARRAY_AGGREGATORS[community_agg]
+            name = f"{racs_name}__atomgroupagg-{community_agg}"
+            aggregated_racs[name] = agg_func(racs_values)
+
+    return aggregated_racs
+
+
+@operates_on_structuregraph
+@operates_on_moleculegraph
+class ModularityCommunityCenteredRACS(MOFBaseFeaturizer):
+    _NAME = "ModularityCommunityCenteredRACS"
+
+    def __init__(
+        self,
+        atom_groups: Optional[Collection[Tuple[str, Collection[str], bool]]] = None,
+        attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"),
+        scopes: Tuple[int] = (1, 2, 3),
+        prop_agg: Tuple[str] = ("product", "diff"),
+        corr_agg: Tuple[str] = ("sum", "avg"),
+        atom_groups_agg: Tuple[str] = ("avg", "sum"),
+        bond_heuristic: str = "vesta",
+        dont_use_communities: bool = False,
+    ):
+        # heteroatom groups is a list of atoms which we will aggregate seperately if they are in a community
+        # additional, we can specify if we only aggregate them seperately if they are not terminal or part of a bridge
+        # if the atom groups are specified, there should maybe (?) also be an option to specify "all"
+        if atom_groups is None:
+            atom_groups = [("all", _ALL_ELEMENTS, False)]
+        self.atom_groups = atom_groups
+        self.attributes = attributes
+        self.scopes = scopes
+        self.prop_agg = prop_agg
+        self.corr_agg = corr_agg
+        self.atom_groups_agg = atom_groups_agg
+        self.bond_heuristic = bond_heuristic
+        self.dont_use_communities = dont_use_communities
+
+    def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]):
+        
+        if not self.dont_use_communities:
+            communities = list(
+                nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected())
+            )
+        else:
+            communities = [range(len(structuregraph))] # make one community with all atoms
+
+        neighbors_at_distance = {
+            i: get_neighbors_up_to_scope(structuregraph, i, max(self.scopes))
+            for i in range(len(structuregraph))
+        }
+
+        groups = _split_up_communities(structuregraph, communities, self.atom_groups)
+
+        racs = {}
+        for group_name, group_indices in groups.items():
+            racs.update(
+                _get_racs_for_community(
+                    group_indices,
+                    structuregraph,
+                    neighbors_at_distance,
+                    self.attributes,
+                    self.scopes,
+                    self.prop_agg,
+                    self.corr_agg,
+                    self.atom_groups_agg,
+                    group_name,
+                )
+            )
+
+        racs_ordered = OrderedDict(sorted(racs.items()))
+        return np.array(list(racs_ordered.values()))
+
+
+    def featurize(self, mof: MOF):
+        return self._featurize(mof.structure_graph)
+
+    def _get_feature_labels(self):
+        names = []
+        for atom_group_name, _, _ in self.atom_groups:
+            for scope in self.scopes:
+                for prop in self.attributes:
+                    for property_agg in self.prop_agg:
+                        for cor_agg in self.corr_agg:
+                            for atom_group_agg in self.atom_groups_agg:
+                                names.append(
+                                    f"{self._NAME}-{atom_group_name}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_atomgroupagg-{atom_group_agg}"  # noqa: E501
+                                )
+
+        names = sorted(names)
+        return names
+
+    def feature_labels(self):
+        return self._get_feature_labels()
+
+    def citations(self) -> List[str]:
+        return [
+            "@article{Janet2017,"
+            "doi = {10.1021/acs.jpca.7b08750},"
+            "url = {https://doi.org/10.1021/acs.jpca.7b08750},"
+            "year = {2017},"
+            "month = nov,"
+            "publisher = {American Chemical Society ({ACS})},"
+            "volume = {121},"
+            "number = {46},"
+            "pages = {8939--8954},"
+            "author = {Jon Paul Janet and Heather J. Kulik},"
+            "title = {Resolving Transition Metal Chemical Space: "
+            "Feature Selection for Machine Learning and Structure{\textendash}Property Relationships},"
+            "journal = {The Journal of Physical Chemistry A}"
+            "}",
+        ]
+
+    def implementors(self):
+        return ["Kevin Maik Jablonka"]
diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py
index 23159ce8..baf78958 100644
--- a/src/mofdscribe/featurizers/chemistry/racs.py
+++ b/src/mofdscribe/featurizers/chemistry/racs.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Revised autocorrelation functions (RACs) for MOFs."""
 from collections import OrderedDict, defaultdict
-from typing import Collection, List, Optional, Tuple, Union
+from typing import Collection, List, Optional, Tuple, Union, Dict, Set
 
 import numpy as np
 from element_coder import encode
@@ -19,18 +19,19 @@
 )
 from mofdscribe.featurizers.utils.structure_graph import (
     get_connected_site_indices,
-    get_neighbors_at_distance,
     get_structure_graph,
+    get_neighbors_up_to_scope,
 )
 from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
 
-__all__ = ("RACS",)
+__all__ = ("RACS", "compute_racs")
 
 
-def _compute_racs(
+def compute_racs(
     start_indices: Collection[int],
     structure_graph: StructureGraph,
+    neighbors_at_distance: Dict[int, Dict[int, Set[int]]],
     properties: Tuple[str],
     scope: int,
     property_aggregations: Tuple[str],
@@ -38,6 +39,22 @@ def _compute_racs(
     part_name: str = "",
     nan_value: float = np.nan,
 ):
+    """Compute the RACs for a given set of properties and scope.
+
+    Args:
+        start_indices (Collection[int]): The indices of the sites to start from.
+        structure_graph (StructureGraph): The structure graph.
+        neighbors_at_distance (Dict[int, Dict[int, Set[int]]], optional): The neighbors at distance. Defaults to None.
+        properties (Tuple[str]): The properties that are correlated
+        scope (int): The scope of the RACs.
+        property_aggregations (Tuple[str]): The aggregations to perform on the properties.
+        corr_aggregations (Tuple[str]): The aggregations to perform on the correlations.
+        part_name (str, optional): The name of the part. Defaults to "".
+        nan_value (float, optional): The value to use for missing values. Defaults to np.nan.
+
+    Returns:
+        Dict[str, float]: The RACs.
+    """
     racs = defaultdict(lambda: defaultdict(list))
     if len(start_indices) == 0:
         logger.debug(f"No start indices for {part_name}")
@@ -46,7 +63,7 @@ def _compute_racs(
                 racs[prop][agg].append(nan_value)
     else:
         for start_atom in start_indices:
-            _, neighbors = get_neighbors_at_distance(structure_graph, start_atom, scope)
+            neighbors = neighbors_at_distance[start_atom][scope]
             num_neighbors = len(neighbors)
             # We only branch if there are actually neighbors scope many bonds away ...
 
@@ -81,7 +98,7 @@ def _compute_racs(
         for aggregation_name, aggregation_values in property_values.items():
             for corr_agg in corr_aggregations:
                 agg_func = ARRAY_AGGREGATORS[corr_agg]
-                name = f"racs_bb-{part_name}_prop-{property_name}_scope-{scope}_propagg-{aggregation_name}_corragg-{corr_agg}"  # noqa: E501
+                name = f"racs-{part_name}_prop-{property_name}_scope-{scope}_propagg-{aggregation_name}_corragg-{corr_agg}"  # noqa: E501
                 aggregated_racs[name] = agg_func(aggregation_values)
 
     return aggregated_racs
@@ -90,6 +107,7 @@ def _compute_racs(
 def _get_racs_for_bbs(
     bb_indices: Collection[int],
     structure_graph: StructureGraph,
+    neighbors_at_distance: Dict[int, Dict[int, Set[int]]],
     properties: Tuple[str],
     scopes: List[int],
     property_aggregations: Tuple[str],
@@ -104,9 +122,10 @@ def _get_racs_for_bbs(
         bb_indices = [[]]
     for start_indices in bb_indices:
         for scope in scopes:
-            racs = _compute_racs(
+            racs = compute_racs(
                 start_indices,
                 structure_graph,
+                neighbors_at_distance,
                 properties,
                 scope,
                 property_aggregations,
@@ -156,7 +175,7 @@ class RACS(MOFBaseFeaturizer):
     To use to original implementation, see `molSimplify
     <https://github.com/hjkgrp/molSimplify>`_.
     """
-
+    _MAME = "RACS"
     def __init__(
         self,
         attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"),
@@ -222,6 +241,10 @@ def _featurize(self, structure: Union[StructureIStructureType, StructureGraph])
                 f"Structure must be pymatgen Structure or StructureGraph, found {type(structure)}"
             )
 
+        neighbors_at_distance = {
+            i: get_neighbors_up_to_scope(sg, i, max(self.scopes)) for i in range(len(structure))
+        }
+
         racs = {}
         # This finds all indices for a particular subset of atoms
         # e.g. "nodes", "linker_all", "linker_connecting", "linker_functional", "linker_scaffold"
@@ -231,6 +254,7 @@ def _featurize(self, structure: Union[StructureIStructureType, StructureGraph])
                 _get_racs_for_bbs(
                     bb_indices[bb],
                     sg,
+                    neighbors_at_distance,
                     self.attributes,
                     self.scopes,
                     self.prop_agg,
@@ -251,7 +275,7 @@ def _get_feature_labels(self) -> List[str]:
                         for cor_agg in self.corr_agg:
                             for bb_agg in self.bb_agg:
                                 names.append(
-                                    f"racs_bb-{bb}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_bbagg-{bb_agg}"  # noqa: E501
+                                    f"{self._NAME}-{bb}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_bbagg-{bb_agg}"  # noqa: E501
                                 )
 
         names = sorted(names)
diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py
index 8d8a150c..79d19842 100644
--- a/src/mofdscribe/featurizers/utils/structure_graph.py
+++ b/src/mofdscribe/featurizers/utils/structure_graph.py
@@ -1,45 +1,69 @@
 # -*- coding: utf-8 -*-
 """perform analyses on structure graphs."""
 from functools import lru_cache
-from typing import List, Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple, Dict
 
 from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IStructure
 from structuregraph_helpers.create import get_structure_graph as get_sg
+from collections import defaultdict
+import networkx as nx
 
+def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int]=None):
+    if bridges is None:
+        bridges = _generate_bridges(nx_graph)
+    sorted_edge = sorted(edge)
+    try:
+        bridge_edge = bridges[sorted_edge[0]]
+        return sorted_edge[1] in bridge_edge
+    except KeyError:
+        return False
 
-def get_neighbors_at_distance(
-    structure_graph: StructureGraph, start: int, scope: int
-) -> Tuple[Set[int], List[int]]:
-    """For a structure graph and a start site, return all sites within a certain\
-        distance (scope) of the start site.
+
+
+def _generate_bridges(nx_graph) -> Dict[int, int]:
+    
+    bridges = list(nx.bridges(nx_graph))
+
+    bridges_dict = defaultdict(list)
+    for key, value in bridges:
+        bridges_dict[key].append(value)
+
+   
+    return bridges_dict
+
+
+
+
+def get_connected_site_indices(sg, site: int):
+    """Get list of indices of neighboring sites."""
+    return [site.index for site in sg.get_connected_sites(site)]
+
+def get_neighbors_up_to_scope(structure_graph, site_index, scope):
+    """Get only the neighbors at a certain scope.
+
+    That is, scope=3 will return all neighbors three bonds away from the
+    site_index.
 
     Args:
-        structure_graph (StructureGraph): pymatgen StructureGraph object
-        start (int): starting atom
-        scope (int): distance to search
+        structure_graph (StructureGraph): The structure graph.
+        site_index (int): The site index.
+        scope (int): The scope.
 
     Returns:
-        Tuple[Set[int], List[int]]: All sites within the scope of the start
-        site, and the indices of the sites in the last shell
+        list: The list of neighbors.
     """
-    # Todo: This code is stupid.
-    neighbors_at_last_level = [start]
-    all_neighbors = set()
-    neighbors_at_next_level = []
-    for _ in range(scope):
-        for n in neighbors_at_last_level:
-            neighbors_at_next_level.extend(get_connected_site_indices(structure_graph, n))
-
-        all_neighbors.update(neighbors_at_last_level)
-        neighbors_at_last_level = neighbors_at_next_level
-        neighbors_at_next_level = []
-    all_neighbors.remove(start)
-    neighbors_at_last_level = set(neighbors_at_last_level)
-    if start in neighbors_at_last_level:
-        neighbors_at_last_level.remove(start)
-
-    return all_neighbors, neighbors_at_last_level
+    neighbors_in_scope = defaultdict(set)
+    neighbors_in_scope[0] = [site_index]
+    visited = set([site_index])
+    for i in range(1, scope + 1):
+        for site in neighbors_in_scope[i - 1]:
+            neighbors = get_connected_site_indices(structure_graph, site)
+            neighbors = [n for n in neighbors if n not in visited]
+            neighbors_in_scope[i].update(neighbors)
+            visited.update(neighbors)
+
+    return neighbors_in_scope
 
 
 @lru_cache()
diff --git a/tests/featurizers/chemistry/test_fragment.py b/tests/featurizers/chemistry/test_fragment.py
index e49fdfb5..cc386125 100644
--- a/tests/featurizers/chemistry/test_fragment.py
+++ b/tests/featurizers/chemistry/test_fragment.py
@@ -53,6 +53,8 @@ def test_get_bb_indices(hkust_structure):
     bb_indices = get_bb_indices(sg)
     assert len(bb_indices["nodes"]) < len(sg)
     assert len(bb_indices["nodes"]) > 0
+
+    # in this fragmentation, we assign as node only the metal part, but not the carboxy or
     assert (
         len(set(sum(bb_indices["nodes"], []))) == dict(hkust_structure.composition)[Element("Cu")]
     )
@@ -70,6 +72,7 @@ def test_get_bb_indices(hkust_structure):
         == 0
     )
 
+    # HKUST has no functional groups
     assert len(sum(bb_indices["linker_functional"], [])) == 0
     assert len(sum(bb_indices["linker_all"], [])) == len(
         sum(bb_indices["linker_scaffold"], [])
diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py
index 6262be54..3a0a2c48 100644
--- a/tests/featurizers/chemistry/test_racs.py
+++ b/tests/featurizers/chemistry/test_racs.py
@@ -7,7 +7,10 @@
 
 from mofdscribe.featurizers.chemistry._fragment import get_bb_indices
 from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs
-from mofdscribe.featurizers.utils.structure_graph import get_structure_graph
+from mofdscribe.featurizers.utils.structure_graph import (
+    get_structure_graph,
+    get_neighbors_up_to_scope,
+)
 from mofdscribe.mof import MOF
 
 from ..helpers import is_jsonable
@@ -21,10 +24,15 @@ def test_racs(hkust_structure, irmof_structure):
         sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic)
         racs = {}
         bb_indices = get_bb_indices(sg)
+        neighbors_at_distance = {
+            i: get_neighbors_up_to_scope(sg, i, max(featurizer.scopes))
+            for i in range(len(structure))
+        }
         for bb in featurizer._bbs:
             v = _get_racs_for_bbs(
                 bb_indices[bb],
                 sg,
+                neighbors_at_distance,
                 featurizer.attributes,
                 featurizer.scopes,
                 featurizer.prop_agg,
@@ -55,12 +63,17 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure):
         feats = featurizer.featurize(MOF(structure))
         # assert len(feats) == 4 * 3 * 8 * 5  # 4 properties, 3 scopes, 8 aggregations, 5 bb types
         sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic)
+        neighbors_at_distance = {
+            i: get_neighbors_up_to_scope(sg, i, max(featurizer.scopes))
+            for i in range(len(structure))
+        }
         racs = {}
         bb_indices = get_bb_indices(sg)
         for bb in featurizer._bbs:
             v = _get_racs_for_bbs(
                 bb_indices[bb],
                 sg,
+                neighbors_at_distance,
                 featurizer.attributes,
                 featurizer.scopes,
                 featurizer.prop_agg,
diff --git a/tests/utils/test_structure_graph.py b/tests/utils/test_structure_graph.py
index 6a8233af..0578feaf 100644
--- a/tests/utils/test_structure_graph.py
+++ b/tests/utils/test_structure_graph.py
@@ -1,11 +1,9 @@
 # -*- coding: utf-8 -*-
 """Test helper functions for dealing with structure graphs."""
-from mofdscribe.featurizers.utils.structure_graph import get_neighbors_at_distance
+from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope
 
-
-def test_get_neighbors_at_distance(hkust_structure_graph):
-    """Test the get_neighbors_at_distance function
-    at some case studies (manually verified)."""
-    assert len(get_neighbors_at_distance(hkust_structure_graph, 1, 1)[1]) == 4
-
-    assert len(get_neighbors_at_distance(hkust_structure_graph, 1, 2)[1]) == 4
+def test_get_neighbors_up_to_scope(hkust_structure_graph): 
+    res = get_neighbors_up_to_scope(hkust_structure_graph, 1, 3)
+    assert len(res[1]) == 4
+    assert len(res[2]) == 4
+    assert len(res[3]) == 8
\ No newline at end of file

From d6bc873b1fad3dd7b2f3ff6ff675374dcdf1586a Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Thu, 29 Dec 2022 21:54:57 +0100
Subject: [PATCH 14/17] wip: updated RACS

[ci skip]
---
 .../source/featurizers/atom_centered/racs.rst |   2 +
 docs/source/featurizers/bu_centered/racs.rst  |  20 +++
 src/mofdscribe/featurizers/__init__.py        |  33 -----
 src/mofdscribe/featurizers/bu/racs.py         | 123 +++++++++++++-----
 src/mofdscribe/featurizers/chemistry/racs.py  |   5 +-
 .../featurizers/utils/definitions.py          | 107 +++++++++++++++
 .../featurizers/utils/structure_graph.py      |  16 +--
 tests/conftest.py                             |   6 +
 tests/featurizers/bu/test_racs.py             |  15 +++
 tests/featurizers/chemistry/test_racs.py      |   2 +-
 tests/test_files/COF-1.cif                    |  75 +++++++++++
 tests/utils/test_structure_graph.py           |   5 +-
 12 files changed, 331 insertions(+), 78 deletions(-)
 create mode 100644 docs/source/featurizers/bu_centered/racs.rst
 create mode 100644 src/mofdscribe/featurizers/utils/definitions.py
 create mode 100644 tests/featurizers/bu/test_racs.py
 create mode 100644 tests/test_files/COF-1.cif

diff --git a/docs/source/featurizers/atom_centered/racs.rst b/docs/source/featurizers/atom_centered/racs.rst
index 905030ad..5d2d5dc5 100644
--- a/docs/source/featurizers/atom_centered/racs.rst
+++ b/docs/source/featurizers/atom_centered/racs.rst
@@ -2,6 +2,8 @@
 Revised autocorrelation functions (RACs)
 .............................................
 
+.. _RACS: 
+
 Revised autocorrelation functions have originally been proposed for
 metal-complexes [Janet2017]_. Autocorrelation functions have been widely used as
 compact, fixed-length descriptors and are defined as
diff --git a/docs/source/featurizers/bu_centered/racs.rst b/docs/source/featurizers/bu_centered/racs.rst
new file mode 100644
index 00000000..12018890
--- /dev/null
+++ b/docs/source/featurizers/bu_centered/racs.rst
@@ -0,0 +1,20 @@
+
+Revised autocorrelation functions (RACs)
+.............................................
+
+See also :ref:`RACs <RACs>`.
+
+This featurizer is a flavor of the :ref:`RACs <RACs>` featurizer, that can split the computation over user-defined atom groups and automatically determined communities.
+
+In mofdscribe, you can customize the encodings :math:`P` (using all properties that are available in our `element-coder <https://github.com/kjappelbaum/element-coder>`_ package) as well as the aggregation functions.
+
+.. featurizer::  ModularityCommunityCenteredRACS
+    :id: ModularityCommunityCenteredRACS
+    :considers_geometry: False
+    :considers_structure_graph: True
+    :encodes_chemistry: optionally
+    :scope: local
+    :scalar: False
+    :style: only-light
+
+    Initially described in [Janet2017]_ for metal complexes, extended to MOFs in [Moosavi2021]_.
diff --git a/src/mofdscribe/featurizers/__init__.py b/src/mofdscribe/featurizers/__init__.py
index 73f995e3..e69de29b 100644
--- a/src/mofdscribe/featurizers/__init__.py
+++ b/src/mofdscribe/featurizers/__init__.py
@@ -1,33 +0,0 @@
-from .bu import LSOP  # noqa: F401
-from .bu import PMI1  # noqa: F401
-from .bu import PMI2  # noqa: F401
-from .bu import PMI3  # noqa: F401
-from .bu import Asphericity  # noqa: F401
-from .bu import BUFeaturizer  # noqa: F401
-from .bu import BUMatch  # noqa: F401
-from .bu import CompositionStats  # noqa: F401
-from .bu import DiskLikeness  # noqa: F401
-from .bu import Eccentricity  # noqa: F401
-from .bu import InertialShapeFactor  # noqa: F401
-from .bu import MOFBBs  # noqa: F401
-from .bu import NConf20  # noqa: F401
-from .bu import PairwiseDistanceHist  # noqa: F401
-from .bu import RadiusOfGyration  # noqa: F401
-from .bu import RDKitAdaptor  # noqa: F401
-from .bu import RodLikeness  # noqa: F401
-from .bu import SphereLikeness  # noqa: F401
-from .bu import SpherocityIndex  # noqa: F401
-from .chemistry import AMD  # noqa: F401
-from .chemistry import APRDF  # noqa: F401
-from .chemistry import RACS  # noqa: F401
-from .chemistry import EnergyGridHistogram  # noqa: F401
-from .chemistry import Henry  # noqa: F401
-from .chemistry import PartialChargeHistogram  # noqa: F401
-from .chemistry import PartialChargeStats  # noqa: F401
-from .chemistry import PriceLowerBound  # noqa: F401
-from .pore import AccessibleVolume  # noqa: F401
-from .pore import PoreDiameters  # noqa: F401
-from .pore import PoreSizeDistribution  # noqa: F401
-from .pore import RayTracingHistogram  # noqa: F401
-from .pore import SurfaceArea  # noqa: F401
-from .topology import AtomCenteredPH, PHHist, PHImage, PHStats, PHVect  # noqa: F401
diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py
index 5f29e872..2144e5a9 100644
--- a/src/mofdscribe/featurizers/bu/racs.py
+++ b/src/mofdscribe/featurizers/bu/racs.py
@@ -1,16 +1,28 @@
 # -*- coding: utf-8 -*-
-from typing import Tuple, Union, List, Optional, Collection, Dict, Set
+""""RACs on molecule and structure graphs with optional community detection."""
+from collections import OrderedDict, defaultdict
+from typing import Collection, Dict, List, Optional, Set, Tuple, Union
+
 import networkx.algorithms.community as nx_comm
-from mofdscribe.featurizers.base import MOFBaseFeaturizer
+import numpy as np
 from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph
-from mofdscribe.utils.extend import operates_on_moleculegraph, operates_on_structuregraph
-from mofdscribe.mof import MOF
-from element_coder.data.coding_data import get_coding_dict
-from mofdscribe.utils.structuregraph import get_neighbors_up_to_scope
-from collections import defaultdict
-from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS
+
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
 from mofdscribe.featurizers.chemistry.racs import compute_racs
-from collections import OrderedDict
+from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
+from mofdscribe.featurizers.utils.definitions import (
+    ALL_ELEMENTS,
+    ALL_ELEMENTS_EXCEPT_C_H,
+    ALL_METAL_ELEMENTS,
+    ALL_NONMETAL_ELEMENTS,
+)
+from mofdscribe.featurizers.utils.extend import (
+    operates_on_moleculegraph,
+    operates_on_structuregraph,
+)
+from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope
+from mofdscribe.mof import MOF
+
 
 def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]):
     if isinstance(structuregraph, StructureGraph):
@@ -20,38 +32,43 @@ def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]):
     else:
         raise ValueError("structuregraph must be either a StructureGraph or a MoleculeGraph")
 
-def _get_atom_site_indices(structuregraph: Union[StructureGraph, MoleculeGraph], atom_groups: Collection[Tuple[str, List[str], bool]]):
+
+def _get_atom_site_indices(
+    structuregraph: Union[StructureGraph, MoleculeGraph],
+    atom_groups: Collection[Tuple[str, List[str], bool]],
+):
     """Get the indices of the sites that belong to the atom groups."""
     atom_grouped_indices = defaultdict(set)
     for atom_group_name, elements, _no_terminal in atom_groups:
-        for site in _get_site_iter(structuregraph):
+        for i, site in enumerate(_get_site_iter(structuregraph)):
             if site.specie.symbol in elements:
-                atom_grouped_indices[atom_group_name].add(site.index)
+                atom_grouped_indices[atom_group_name].add(i)
     return atom_grouped_indices
 
+
 def _split_up_communities(
     structuregraph: Union[StructureGraph, MoleculeGraph],
     communities: List[List[int]],
     atom_groups: Collection[Tuple[str, List[str], bool]],
 ):
     """Create dictionary of communities with atom groups as keys."""
-    atom_grouped_communities = defaultdict(set)
-    indices_to_atom_group = _get_atom_site_indices()
+    atom_grouped_communities = defaultdict(list)
+    indices_to_atom_group = _get_atom_site_indices(
+        structuregraph=structuregraph, atom_groups=atom_groups
+    )
 
     for atom_group_name, elements, _no_terminal in atom_groups:
         for communities in communities:
-            atom_grouped_communities[atom_group_name].update(indices_to_atom_group & set(communities))
+            if not isinstance(communities, set):
+                communities = set([communities])
+            atom_grouped_communities[atom_group_name].append(
+                indices_to_atom_group[atom_group_name] & set(communities)
+            )
 
     return atom_grouped_communities
-    
-    
-
 
-_ALL_ELEMENTS = set(get_coding_dict("atomic").keys())
-_ALL_ELEMENTS_EXCEPT_C_H = _ALL_ELEMENTS - {"C", "H"}
 
-
-# ToDo: generalize the code and implement in only one place 
+# ToDo: generalize the code and implement in only one place
 # the main racs code is quite similar to this one
 def _get_racs_for_community(
     community_indices,
@@ -97,6 +114,8 @@ def _get_racs_for_community(
 @operates_on_structuregraph
 @operates_on_moleculegraph
 class ModularityCommunityCenteredRACS(MOFBaseFeaturizer):
+    """RACs on molecule and structure graphs with optional community detection."""
+
     _NAME = "ModularityCommunityCenteredRACS"
 
     def __init__(
@@ -107,31 +126,72 @@ def __init__(
         prop_agg: Tuple[str] = ("product", "diff"),
         corr_agg: Tuple[str] = ("sum", "avg"),
         atom_groups_agg: Tuple[str] = ("avg", "sum"),
-        bond_heuristic: str = "vesta",
         dont_use_communities: bool = False,
     ):
-        # heteroatom groups is a list of atoms which we will aggregate seperately if they are in a community
-        # additional, we can specify if we only aggregate them seperately if they are not terminal or part of a bridge
-        # if the atom groups are specified, there should maybe (?) also be an option to specify "all"
+        """Constuct a ModularityCommunityCenteredRACS featurizer.
+
+        Features are computed for each atom group and for each community. The features are then aggregated
+        over the communities.
+
+        Args:
+            atom_groups (Optional[Collection[Tuple[str, Collection[str], bool]]], optional):
+                Elements which form start scopes for RACs.
+                The first element is the name of the atom group, and will appear in the feature name.
+                The second element is a list of elements that belong to the atom group. For example,
+                ('C-H', ['C', 'H'], False) will create a feature for all carbon atoms and hydrogen atoms.
+                The third element of the tuple is currently not used (but will be used as a flag to
+                indicate whether to include terminal/leading to terminal atoms in the start scope).
+                If set to None, there will be one atom group with all atoms.
+                Defaults to None.
+            attributes (Tuple[Union[int, str]], optional):
+                Elemental properties used for the construction of RACs. Defaults to ("X", "mod_pettifor", "I", "T").
+            scopes (Tuple[int], optional): Scopes used for the construction of RACs.
+                Defaults to (1, 2, 3).
+            prop_agg (Tuple[str], optional): Aggregation methods used for the aggregation of
+                properties. "Product" corresponds to "product-RACs" and "diff" to "difference-RACs".
+                Defaults to ("product", "diff").
+            corr_agg (Tuple[str], optional): Aggregation methods used for the aggregation of the
+                correlated properties. Defaults to ("sum", "avg").
+            atom_groups_agg (Tuple[str], optional): Aggregation methods used for the pooling
+                over atom communities within one atom group. Defaults to ("avg", "sum").
+            dont_use_communities (bool, optional): If set to true, we do not use modularity-based community detection.
+                Features are then simply averaged over all atoms.
+                Defaults to False.
+        """
         if atom_groups is None:
-            atom_groups = [("all", _ALL_ELEMENTS, False)]
+            atom_groups = [("all", ALL_ELEMENTS, False)]
         self.atom_groups = atom_groups
         self.attributes = attributes
         self.scopes = scopes
         self.prop_agg = prop_agg
         self.corr_agg = corr_agg
         self.atom_groups_agg = atom_groups_agg
-        self.bond_heuristic = bond_heuristic
         self.dont_use_communities = dont_use_communities
 
+    @classmethod
+    def from_preset(cls, preset: str, **kwargs):
+        if preset.lower() == "cof":
+            atom_groups = [
+                ("all", ALL_ELEMENTS, False),
+                ("C-H", {"C", "H"}, False),
+                ("not_C-H", ALL_ELEMENTS_EXCEPT_C_H, False),
+            ]
+        elif preset.lower() == "mof":
+            atom_groups = [
+                ("all", ALL_ELEMENTS, False),
+                ("metal", ALL_METAL_ELEMENTS, False),
+                ("nonmetal", ALL_NONMETAL_ELEMENTS, False),
+            ]
+        return cls(atom_groups=atom_groups, **kwargs)
+
     def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]):
-        
+
         if not self.dont_use_communities:
             communities = list(
                 nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected())
             )
         else:
-            communities = [range(len(structuregraph))] # make one community with all atoms
+            communities = [range(len(structuregraph))]  # make one community with all atoms
 
         neighbors_at_distance = {
             i: get_neighbors_up_to_scope(structuregraph, i, max(self.scopes))
@@ -140,6 +200,8 @@ def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]):
 
         groups = _split_up_communities(structuregraph, communities, self.atom_groups)
 
+        print(groups)
+
         racs = {}
         for group_name, group_indices in groups.items():
             racs.update(
@@ -159,7 +221,6 @@ def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]):
         racs_ordered = OrderedDict(sorted(racs.items()))
         return np.array(list(racs_ordered.values()))
 
-
     def featurize(self, mof: MOF):
         return self._featurize(mof.structure_graph)
 
diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py
index baf78958..7c35fd49 100644
--- a/src/mofdscribe/featurizers/chemistry/racs.py
+++ b/src/mofdscribe/featurizers/chemistry/racs.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Revised autocorrelation functions (RACs) for MOFs."""
 from collections import OrderedDict, defaultdict
-from typing import Collection, List, Optional, Tuple, Union, Dict, Set
+from typing import Collection, Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
 from element_coder import encode
@@ -19,8 +19,8 @@
 )
 from mofdscribe.featurizers.utils.structure_graph import (
     get_connected_site_indices,
-    get_structure_graph,
     get_neighbors_up_to_scope,
+    get_structure_graph,
 )
 from mofdscribe.mof import MOF
 from mofdscribe.types import StructureIStructureType
@@ -176,6 +176,7 @@ class RACS(MOFBaseFeaturizer):
     <https://github.com/hjkgrp/molSimplify>`_.
     """
     _MAME = "RACS"
+
     def __init__(
         self,
         attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"),
diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py
new file mode 100644
index 00000000..d3f33bd0
--- /dev/null
+++ b/src/mofdscribe/featurizers/utils/definitions.py
@@ -0,0 +1,107 @@
+from element_coder.data.coding_data import get_coding_dict
+
+ALL_ELEMENTS = set(get_coding_dict("atomic").keys())
+ALL_ELEMENTS_EXCEPT_C_H = ALL_ELEMENTS - {"C", "H"}
+ALL_METAL_ELEMENTS = set(
+    (
+        "Li",
+        "Be",
+        "Na",
+        "Mg",
+        "Al",
+        "K",
+        "Ca",
+        "Sc",
+        "Ti",
+        "V",
+        "Cr",
+        "Mn",
+        "Fe",
+        "Co",
+        "Ni",
+        "Cu",
+        "Zn",
+        "Ga",
+        "Rb",
+        "Sr",
+        "Y",
+        "Zr",
+        "Nb",
+        "Mo",
+        "Tc",
+        "Ru",
+        "Rh",
+        "Pd",
+        "Ag",
+        "Cd",
+        "In",
+        "Sn",
+        "Cs",
+        "Ba",
+        "La",
+        "Ce",
+        "Pr",
+        "Nd",
+        "Pm",
+        "Sm",
+        "Eu",
+        "Gd",
+        "Tb",
+        "Dy",
+        "Ho",
+        "Er",
+        "Tm",
+        "Yb",
+        "Lu",
+        "Hf",
+        "Ta",
+        "W",
+        "Re",
+        "Os",
+        "Ir",
+        "Pt",
+        "Au",
+        "Hg",
+        "Tl",
+        "Pb",
+        "Bi",
+        "Fr",
+        "Ra",
+        "Ac",
+        "Th",
+        "Pa",
+        "U",
+        "Np",
+        "Pu",
+        "Am",
+        "Cm",
+        "Bk",
+        "Cf",
+        "Es",
+        "Fm",
+        "Md",
+        "No",
+        "Lr",
+        "Rf",
+        "Db",
+        "Sg",
+        "Bh",
+        "Hs",
+        "Mt",
+        "Ds",
+        "Rg",
+        "Cn",
+        "Al",
+        "Ga",
+        "In",
+        "Tl",
+        "Ge",
+        "Sn",
+        "Pb",
+        "Sb",
+        "Bi",
+        "Po",
+    )
+)
+
+_ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS
diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py
index 79d19842..4fa05e69 100644
--- a/src/mofdscribe/featurizers/utils/structure_graph.py
+++ b/src/mofdscribe/featurizers/utils/structure_graph.py
@@ -1,15 +1,16 @@
 # -*- coding: utf-8 -*-
 """perform analyses on structure graphs."""
+from collections import defaultdict
 from functools import lru_cache
-from typing import List, Optional, Set, Tuple, Dict
+from typing import Dict, List, Optional, Set, Tuple
 
+import networkx as nx
 from pymatgen.analysis.graphs import StructureGraph
 from pymatgen.core import IStructure
 from structuregraph_helpers.create import get_structure_graph as get_sg
-from collections import defaultdict
-import networkx as nx
 
-def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int]=None):
+
+def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int] = None):
     if bridges is None:
         bridges = _generate_bridges(nx_graph)
     sorted_edge = sorted(edge)
@@ -20,25 +21,22 @@ def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int]=None):
         return False
 
 
-
 def _generate_bridges(nx_graph) -> Dict[int, int]:
-    
+
     bridges = list(nx.bridges(nx_graph))
 
     bridges_dict = defaultdict(list)
     for key, value in bridges:
         bridges_dict[key].append(value)
 
-   
     return bridges_dict
 
 
-
-
 def get_connected_site_indices(sg, site: int):
     """Get list of indices of neighboring sites."""
     return [site.index for site in sg.get_connected_sites(site)]
 
+
 def get_neighbors_up_to_scope(structure_graph, site_index, scope):
     """Get only the neighbors at a certain scope.
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 433b6286..c6a5e65e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,12 @@ def hkust_structure():
     return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif"))
 
 
+@pytest.fixture(scope="session")
+def cof_1_structure():
+    """Return a pymatgen Structure for COF"""
+    return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "COF-1.cif"))
+
+
 @pytest.fixture(scope="session")
 def mof74_structure():
     """Return a pymatgen Structure for MOF74"""
diff --git a/tests/featurizers/bu/test_racs.py b/tests/featurizers/bu/test_racs.py
new file mode 100644
index 00000000..379dfff7
--- /dev/null
+++ b/tests/featurizers/bu/test_racs.py
@@ -0,0 +1,15 @@
+from mofdscribe.featurizers.bu.racs import ModularityCommunityCenteredRACS
+from mofdscribe.mof import MOF
+
+
+def test_modularitycommunity_racs(cof_1_structure):
+    featurizer = ModularityCommunityCenteredRACS()
+    feats = featurizer.featurize(MOF(cof_1_structure))
+    labels = featurizer.feature_labels()
+    assert len(feats) == len(labels)
+
+    # cof preset
+    featurizer = ModularityCommunityCenteredRACS.from_preset("cof")
+    feats = featurizer.featurize(MOF(cof_1_structure))
+    labels = featurizer.feature_labels()
+    assert len(feats) == len(labels)
diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py
index 3a0a2c48..7027452a 100644
--- a/tests/featurizers/chemistry/test_racs.py
+++ b/tests/featurizers/chemistry/test_racs.py
@@ -8,8 +8,8 @@
 from mofdscribe.featurizers.chemistry._fragment import get_bb_indices
 from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs
 from mofdscribe.featurizers.utils.structure_graph import (
-    get_structure_graph,
     get_neighbors_up_to_scope,
+    get_structure_graph,
 )
 from mofdscribe.mof import MOF
 
diff --git a/tests/test_files/COF-1.cif b/tests/test_files/COF-1.cif
new file mode 100644
index 00000000..5c2a5709
--- /dev/null
+++ b/tests/test_files/COF-1.cif
@@ -0,0 +1,75 @@
+data_COF-1
+
+_audit_creation_method RASPA-1.0
+_audit_creation_date 2011-3-7
+_audit_author_name 'David Dubbeldam'
+
+_citation_author_name        'A.P. Cote, A.I. Benin, N.W. Ockwig, M. O’Keeffe, A.J. Matzger, and O.M. Yaghi''
+_citation_title              'Porous, crystalline, covalent organic frameworks'
+_citation_journal_abbrev     'Science'
+_citation_journal_volume     310
+_citation_journal_number     5751
+_citation_page_first         1166
+_citation_page_last          1170
+_citation_year               2005
+
+_cell_length_a    15.6529
+_cell_length_b    15.6529
+_cell_length_c    6.7005
+_cell_angle_alpha 90
+_cell_angle_beta  90
+_cell_angle_gamma 120
+_cell_volume      1421.76
+
+_symmetry_cell_setting          hexagonal
+_symmetry_space_group_name_Hall '-P 6c 2c'
+_symmetry_space_group_name_H-M  'P 63/m m c'
+_symmetry_Int_Tables_number     194
+
+loop_
+_symmetry_equiv_pos_as_xyz
+ 'x,y,z'
+ '-y,x-y,z'
+ '-x+y,-x,z'
+ '-x,-y,z+1/2'
+ 'y,-x+y,z+1/2'
+ 'x-y,x,z+1/2'
+ 'y,x,-z'
+ 'x-y,-y,-z'
+ '-x,-x+y,-z'
+ '-y,-x,-z+1/2'
+ '-x+y,y,-z+1/2'
+ 'x,x-y,-z+1/2'
+ '-x,-y,-z'
+ 'y,-x+y,-z'
+ 'x-y,x,-z'
+ 'x,y,-z+1/2'
+ '-y,x-y,-z+1/2'
+ '-x+y,-x,-z+1/2'
+ '-y,-x,z'
+ '-x+y,y,z'
+ 'x,x-y,z'
+ 'y,x,z+1/2'
+ 'x-y,-y,z+1/2'
+ '-x,-x+y,z+1/2'
+
+loop_
+_atom_site_label
+_atom_site_type_symbol
+_atom_site_fract_x
+_atom_site_fract_y
+_atom_site_fract_z
+_atom_site_charge
+B1       B      0.05772    0.11543    0.25       0 
+B2       B      0.44466    0.72233    0.25       0 
+O1       O      0.11133    0.05567    0.25       0 
+O2       O      0.389      0.778      0.25       0 
+C1       C      0.11184    0.38361    0.25       0 
+C2       C      0.21864    0.43728    0.25       0 
+C3       C      0.21837    0.27685    0.25       0 
+C4       C      0.1103     0.8897     0.25       0 
+H1       H      0.57703    0.92773    0.25       0 
+H2       H      1.02045    0.76181    0.75       0 
+
+
+
diff --git a/tests/utils/test_structure_graph.py b/tests/utils/test_structure_graph.py
index 0578feaf..8ea4a227 100644
--- a/tests/utils/test_structure_graph.py
+++ b/tests/utils/test_structure_graph.py
@@ -2,8 +2,9 @@
 """Test helper functions for dealing with structure graphs."""
 from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope
 
-def test_get_neighbors_up_to_scope(hkust_structure_graph): 
+
+def test_get_neighbors_up_to_scope(hkust_structure_graph):
     res = get_neighbors_up_to_scope(hkust_structure_graph, 1, 3)
     assert len(res[1]) == 4
     assert len(res[2]) == 4
-    assert len(res[3]) == 8
\ No newline at end of file
+    assert len(res[3]) == 8

From 6a887dc660b2bff230ff358d6a6a8a0d9cb6d960 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Fri, 30 Dec 2022 08:15:36 +0100
Subject: [PATCH 15/17] fix lint and tests, start adding metrics docs

[ci skip]
---
 docs/source/featurizers/bu_centered/racs.rst  |  3 +-
 docs/source/index.rst                         |  1 +
 docs/source/metrics.rst                       | 14 +++++++
 docs/source/references.rst                    |  4 +-
 src/mofdscribe/featurizers/bu/racs.py         | 40 ++++++++++++++-----
 src/mofdscribe/featurizers/chemistry/racs.py  |  7 ++--
 .../featurizers/utils/definitions.py          |  2 +
 .../featurizers/utils/structure_graph.py      |  7 +---
 8 files changed, 56 insertions(+), 22 deletions(-)
 create mode 100644 docs/source/metrics.rst

diff --git a/docs/source/featurizers/bu_centered/racs.rst b/docs/source/featurizers/bu_centered/racs.rst
index 12018890..23458eee 100644
--- a/docs/source/featurizers/bu_centered/racs.rst
+++ b/docs/source/featurizers/bu_centered/racs.rst
@@ -4,7 +4,8 @@ Revised autocorrelation functions (RACs)
 
 See also :ref:`RACs <RACs>`.
 
-This featurizer is a flavor of the :ref:`RACs <RACs>` featurizer, that can split the computation over user-defined atom groups and automatically determined communities.
+This featurizer is a flavor of the :ref:`RACs <RACs>` featurizer, that can split the computation over user-defined atom groups and automatically determined communities. For determining communities, we use ``networkx``'s implementation of greedy modularity maximization [NewmanModularity]_.
+This community detection often corresponds to chemically meaningful parts (often ring systems) of the structure (but does not require an explicit fragmentation algorithm).
 
 In mofdscribe, you can customize the encodings :math:`P` (using all properties that are available in our `element-coder <https://github.com/kjappelbaum/element-coder>`_ package) as well as the aggregation functions.
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1a240b14..014a5ebb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -37,6 +37,7 @@ dependencies, the featurizers are currently not integrated in matminer itself.
    splitters
    datasets
    background
+   metrics
    leaderboard
    contributing
    api
diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
new file mode 100644
index 00000000..f27ee506
--- /dev/null
+++ b/docs/source/metrics.rst
@@ -0,0 +1,14 @@
+Metrics
+===================
+
+In order to compare our models we need to score them using a metric. 
+Most commonly used in are scores such as accuracy, precision, recall, or the mean absolute error for regression problem. 
+
+However, these metrics are not always the best choice. 
+It is well known, for instance, that accuracy is not a good metric for imbalanced datasets.
+However, even beyond such considerations it is important to take into consideration for what purpose the model is used.
+
+For materials discovery, this often implies that a metric that measures how many of the top materials we find is more 
+important than an averaged, overall score.
+
+``mofdscribe`` provides some utilities to help with this in the ``mofdscribe.metrics`` subpackage.
\ No newline at end of file
diff --git a/docs/source/references.rst b/docs/source/references.rst
index 3bf3bd47..b959f0ff 100644
--- a/docs/source/references.rst
+++ b/docs/source/references.rst
@@ -113,4 +113,6 @@ References
 
 .. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. <https://doi.org/10.1016/j.neuroimage.2017.06.061>`_
 
-.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. <https://doi.org/10.1103/PhysRevMaterials.3.034003>`_
\ No newline at end of file
+.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. <https://doi.org/10.1103/PhysRevMaterials.3.034003>`_
+
+.. [NewmanModularity] `Newman, M. E. J. Modularity and community structure in networks. Proceedings of the National Academy of Sciences 2006, 103 (23), 8577–8582. <https://doi.org/10.1073/pnas.0601602103>`_
\ No newline at end of file
diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py
index 2144e5a9..f4c55b78 100644
--- a/src/mofdscribe/featurizers/bu/racs.py
+++ b/src/mofdscribe/featurizers/bu/racs.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-""""RACs on molecule and structure graphs with optional community detection."""
+"""RACs on molecule and structure graphs with optional community detection."""
 from collections import OrderedDict, defaultdict
 from typing import Collection, Dict, List, Optional, Set, Tuple, Union
 
@@ -57,12 +57,14 @@ def _split_up_communities(
         structuregraph=structuregraph, atom_groups=atom_groups
     )
 
-    for atom_group_name, elements, _no_terminal in atom_groups:
-        for communities in communities:
-            if not isinstance(communities, set):
-                communities = set([communities])
+    for atom_group_name, _elements, _no_terminal in atom_groups:
+        for community in communities:
+            if not isinstance(community, set):
+                communities_set = set([community])
+            else:
+                communities_set = community
             atom_grouped_communities[atom_group_name].append(
-                indices_to_atom_group[atom_group_name] & set(communities)
+                indices_to_atom_group[atom_group_name] & set(communities_set)
             )
 
     return atom_grouped_communities
@@ -114,7 +116,26 @@ def _get_racs_for_community(
 @operates_on_structuregraph
 @operates_on_moleculegraph
 class ModularityCommunityCenteredRACS(MOFBaseFeaturizer):
-    """RACs on molecule and structure graphs with optional community detection."""
+    """RACs on molecule and structure graphs with optional community detection.
+
+    This featurizer is a flavor of the :ref:`RACs <RACs>` featurizer.
+    It can split the computation over user-defined atom groups and automatically determined communities.
+    For determining communities, we use ``networkx``'s implementation
+    of greedy modularity maximization [NewmanModularity]_.
+
+    The features are computed for each communinty within each atom group and then aggregated over the communities.
+    That is, the number of features will depend on the number of atom groups.
+    The communities will only impact how the features within an atom group are aggregated.
+
+    There are different ways in which you might want to use this featurizer.
+
+    1) No prior on communitiy structure. In this case, you can set ``dont_use_communities=True``.
+    It will still compute the RACs over the atom groups you specify but won't have an inner loop over communities
+    that are then aggegated for the final features for a given atom group.
+
+    2) No prior on atom grouping. In this case, you can set ``atom_groups=None``.
+    This will compute the RACs over all atoms.
+    """
 
     _NAME = "ModularityCommunityCenteredRACS"
 
@@ -154,7 +175,7 @@ def __init__(
                 correlated properties. Defaults to ("sum", "avg").
             atom_groups_agg (Tuple[str], optional): Aggregation methods used for the pooling
                 over atom communities within one atom group. Defaults to ("avg", "sum").
-            dont_use_communities (bool, optional): If set to true, we do not use modularity-based community detection.
+            dont_use_communities (bool): If set to true, we do not use modularity-based community detection.
                 Features are then simply averaged over all atoms.
                 Defaults to False.
         """
@@ -185,7 +206,6 @@ def from_preset(cls, preset: str, **kwargs):
         return cls(atom_groups=atom_groups, **kwargs)
 
     def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]):
-
         if not self.dont_use_communities:
             communities = list(
                 nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected())
@@ -200,8 +220,6 @@ def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]):
 
         groups = _split_up_communities(structuregraph, communities, self.atom_groups)
 
-        print(groups)
-
         racs = {}
         for group_name, group_indices in groups.items():
             racs.update(
diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py
index 7c35fd49..a252a554 100644
--- a/src/mofdscribe/featurizers/chemistry/racs.py
+++ b/src/mofdscribe/featurizers/chemistry/racs.py
@@ -49,8 +49,8 @@ def compute_racs(
         scope (int): The scope of the RACs.
         property_aggregations (Tuple[str]): The aggregations to perform on the properties.
         corr_aggregations (Tuple[str]): The aggregations to perform on the correlations.
-        part_name (str, optional): The name of the part. Defaults to "".
-        nan_value (float, optional): The value to use for missing values. Defaults to np.nan.
+        part_name (str): The name of the part. Defaults to "".
+        nan_value (float): The value to use for missing values. Defaults to np.nan.
 
     Returns:
         Dict[str, float]: The RACs.
@@ -175,7 +175,8 @@ class RACS(MOFBaseFeaturizer):
     To use to original implementation, see `molSimplify
     <https://github.com/hjkgrp/molSimplify>`_.
     """
-    _MAME = "RACS"
+
+    _NAME = "RACS"
 
     def __init__(
         self,
diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py
index d3f33bd0..87bd533f 100644
--- a/src/mofdscribe/featurizers/utils/definitions.py
+++ b/src/mofdscribe/featurizers/utils/definitions.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+"""Constants used in the featurizers."""
 from element_coder.data.coding_data import get_coding_dict
 
 ALL_ELEMENTS = set(get_coding_dict("atomic").keys())
diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py
index 4fa05e69..d7889eae 100644
--- a/src/mofdscribe/featurizers/utils/structure_graph.py
+++ b/src/mofdscribe/featurizers/utils/structure_graph.py
@@ -2,7 +2,7 @@
 """perform analyses on structure graphs."""
 from collections import defaultdict
 from functools import lru_cache
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional
 
 import networkx as nx
 from pymatgen.analysis.graphs import StructureGraph
@@ -32,11 +32,6 @@ def _generate_bridges(nx_graph) -> Dict[int, int]:
     return bridges_dict
 
 
-def get_connected_site_indices(sg, site: int):
-    """Get list of indices of neighboring sites."""
-    return [site.index for site in sg.get_connected_sites(site)]
-
-
 def get_neighbors_up_to_scope(structure_graph, site_index, scope):
     """Get only the neighbors at a certain scope.
 

From 318968f68427c63b8f041d85a1c45e905cf35a5c Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Fri, 30 Dec 2022 08:50:57 +0100
Subject: [PATCH 16/17] fix failing tests

---
 src/mofdscribe/featurizers/utils/definitions.py | 3 ++-
 tests/featurizers/chemistry/test_racs.py        | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py
index 87bd533f..e9b6400b 100644
--- a/src/mofdscribe/featurizers/utils/definitions.py
+++ b/src/mofdscribe/featurizers/utils/definitions.py
@@ -30,6 +30,7 @@
         "Zr",
         "Nb",
         "Mo",
+
         "Tc",
         "Ru",
         "Rh",
@@ -106,4 +107,4 @@
     )
 )
 
-_ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS
+ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS
diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py
index 7027452a..939b7125 100644
--- a/tests/featurizers/chemistry/test_racs.py
+++ b/tests/featurizers/chemistry/test_racs.py
@@ -48,7 +48,7 @@ def test_racs(hkust_structure, irmof_structure):
                 assert np.isnan(np.array(list(v.values()))).sum() == len(v)
         racs_ordered = OrderedDict(sorted(racs.items()))
 
-        assert list(racs_ordered.keys()) == featurizer.feature_labels()
+        assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels()))
 
     # assert len(featurizer.feature_labels()) == 120
     assert len(featurizer.citations()) == 2
@@ -85,7 +85,7 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure):
             # we classify the "O" as a functional group
             assert np.isnan(np.array(list(v.values()))).sum() == 0
         racs_ordered = OrderedDict(sorted(racs.items()))
-        assert list(racs_ordered.keys()) == featurizer.feature_labels()
+        assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels()))
 
     # assert len(featurizer.feature_labels()) == 120
     assert len(featurizer.citations()) == 2

From 294cfe1d6f5d2519141b9e2e3689b0fc651d11fd Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
Date: Sun, 1 Jan 2023 11:27:04 +0100
Subject: [PATCH 17/17] capture errors

---
 src/mofdscribe/featurizers/bu/__init__.py     |  1 +
 .../featurizers/bu/bu_featurizer.py           | 70 ++++++++++++++-----
 src/mofdscribe/featurizers/bu/rdkitadaptor.py | 10 ++-
 .../featurizers/utils/aggregators.py          | 65 +++++++++--------
 .../featurizers/utils/definitions.py          |  1 -
 tests/featurizers/chemistry/test_racs.py      |  8 ++-
 6 files changed, 104 insertions(+), 51 deletions(-)

diff --git a/src/mofdscribe/featurizers/bu/__init__.py b/src/mofdscribe/featurizers/bu/__init__.py
index 1add3acf..fc999d9e 100644
--- a/src/mofdscribe/featurizers/bu/__init__.py
+++ b/src/mofdscribe/featurizers/bu/__init__.py
@@ -30,3 +30,4 @@
     SphereLikeness,
     SpherocityIndex,
 )
+from .smarts_matches import AcidGroupCounter, BaseGroupCounter, SmartsMatchCounter  # noqa: F401
diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py
index d67677b4..5090538f 100644
--- a/src/mofdscribe/featurizers/bu/bu_featurizer.py
+++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py
@@ -348,8 +348,16 @@ class BindingSitesFeaturizer(_BUSubBaseFeaturizer):
 
     def _extract(self, mof: MOF):
         if Structure in self._operates_on:
-            linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers]
-            nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes]
+            try:
+                linkers = [
+                    linker._get_binding_sites_structure() for linker in mof.fragments.linkers
+                ]
+            except Exception:
+                linkers = []
+            try:
+                nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes]
+            except Exception:
+                nodes = []
             return nodes, linkers
         elif StructureGraph in self._operates_on:
             linkers = [
@@ -393,8 +401,16 @@ class BranchingSitesFeaturizer(_BUSubBaseFeaturizer):
 
     def _extract(self, mof: MOF):
         if Structure in self._operates_on:
-            linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers]
-            nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes]
+            try:
+                linkers = [
+                    linker._get_branching_sites_structure() for linker in mof.fragments.linkers
+                ]
+            except Exception:
+                linkers = []
+            try:
+                nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes]
+            except Exception:
+                nodes = []
             return nodes, linkers
         elif StructureGraph in self._operates_on:
             linkers = [
@@ -434,7 +450,9 @@ def __init__(self, hop_stat_aggregations, aggregations, index_extractor: Callabl
     def _extract(self, mof: MOF):
 
         linkers = [linker.molecule_graph for linker in mof.fragments.linkers]
-        nodes = [node.molecule_graph for node in mof.fragments.nodes]
+        nodes = [
+            node.molecule_graph for node in mof.fragments.nodes
+        ]  # fixme: should use the dummy graph
 
         linker_indices = [self.index_extractor(linker) for linker in mof.fragments.linkers]
         node_indices = [self.index_extractor(node) for node in mof.fragments.nodes]
@@ -442,18 +460,36 @@ def _extract(self, mof: MOF):
 
     def featurize(self, mof: Optional[MOF] = None) -> np.ndarray:
         nodes, linkers, node_indices, linker_indices = self._extract(mof)
-        node_feats = np.array(
-            [
-                self._featurizer._featurize(node, node_idx)
-                for node, node_idx in zip(nodes, node_indices)
-            ]
-        )
-        linker_feats = np.array(
-            [
-                self._featurizer._featurize(linker, linker_idx)
-                for linker, linker_idx in zip(linkers, linker_indices)
-            ]
-        )
+        node_feats = []
+
+        # np.array(
+        #     [
+        #         self._featurizer._featurize(node, node_idx)
+        #         for node, node_idx in zip(nodes, node_indices)
+        #     ]
+        # )
+
+        for node, node_idx in zip(nodes, node_indices):
+            try:
+                node_feats.append(self._featurizer._featurize(node, node_idx))
+            except Exception:
+                node_feats.append([np.nan] * len(self._featurizer.feature_labels()))
+
+        node_feats = np.array(node_feats)
+
+        linker_feats = []  # np.array(
+        #     [
+        #         self._featurizer._featurize(linker, linker_idx)
+        #         for linker, linker_idx in zip(linkers, linker_indices)
+        #     ]
+        # )
+
+        for linker, linker_idx in zip(linkers, linker_indices):
+            try:
+                linker_feats.append(self._featurizer._featurize(linker, linker_idx))
+            except Exception:
+                linker_feats.append([np.nan] * len(self._featurizer.feature_labels()))
+        linker_feats = np.array(linker_feats)
 
         node_aggregated = np.concatenate(
             [ARRAY_AGGREGATORS[agg](node_feats, axis=0) for agg in self.aggregations]
diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py
index b8e1ee35..02b23d24 100644
--- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py
+++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py
@@ -97,9 +97,13 @@ def _featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray:
             molecule_graph = MoleculeGraph.with_local_env_strategy(
                 molecule, get_local_env_method(self._local_env_strategy)
             )
-        rdkit_mol = create_rdkit_mol_from_mol_graph(
-            molecule_graph, force_sanitize=self._force_sanitize
-        )
+        try:
+            rdkit_mol = create_rdkit_mol_from_mol_graph(
+                molecule_graph, force_sanitize=self._force_sanitize
+            )
+        except Exception:
+            logger.warning("Could not create RDKit molecule from pymatgen molecule.")
+            return np.array([np.nan] * len(self._feature_labels))
         feats = self._featurizer(rdkit_mol)
         if isinstance(feats, (list, tuple, np.ndarray)):
             return np.asarray(feats)
diff --git a/src/mofdscribe/featurizers/utils/aggregators.py b/src/mofdscribe/featurizers/utils/aggregators.py
index 01628776..ae3fa976 100644
--- a/src/mofdscribe/featurizers/utils/aggregators.py
+++ b/src/mofdscribe/featurizers/utils/aggregators.py
@@ -48,6 +48,13 @@ def masked_trimean(values):
     return (q1 + 2 * q2 + q3) / 4
 
 
+def try_except_nan(func, *args, **kwargs):
+    try:
+        return func(*args, **kwargs)
+    except Exception:
+        return np.nan
+
+
 AGGREGATORS = {
     "sum": lambda x: x[0] + x[1],
     "avg": lambda x: (x[0] + x[1]) / 2,
@@ -62,36 +69,38 @@ def masked_trimean(values):
 
 
 ARRAY_AGGREGATORS = {
-    "sum": lambda x, **kwargs: np.sum(x, **kwargs),
-    "avg": lambda x, **kwargs: np.mean(x, **kwargs),
-    "max": lambda x, **kwargs: np.max(x, **kwargs),
-    "min": lambda x, **kwargs: np.min(x, **kwargs),
-    "std": lambda x, **kwargs: np.std(x, **kwargs),
-    "range": lambda x, **kwargs: np.max(x, **kwargs) - np.min(x, **kwargs),
-    "mean": lambda x, **kwargs: np.mean(x, **kwargs),
-    "median": lambda x, **kwargs: np.median(x, **kwargs),
-    "geomean": lambda x, **kwargs: gmean(x, **kwargs),
-    "harmean": lambda x, **kwargs: hmean(x, **kwargs),
-    "mad": lambda x, **kwargs: mad(x, **kwargs),
-    "trimean": lambda x, **kwargs: trimean(x, **kwargs),
-    "inf": lambda x, **kwargs: np.linalg.norm(x, ord=np.inf, **kwargs),
-    "manhattan": lambda x, **kwargs: np.linalg.norm(x, ord=1, **kwargs),
+    "sum": lambda x, **kwargs: try_except_nan(np.sum, x, **kwargs),
+    "avg": lambda x, **kwargs: try_except_nan(np.mean, x, **kwargs),
+    "max": lambda x, **kwargs: try_except_nan(np.max, x, **kwargs),
+    "min": lambda x, **kwargs: try_except_nan(np.min, x, **kwargs),
+    "std": lambda x, **kwargs: try_except_nan(np.std, x, **kwargs),
+    "range": lambda x, **kwargs: try_except_nan(np.max, x, **kwargs)
+    - try_except_nan(np.min, x, **kwargs),
+    "mean": lambda x, **kwargs: try_except_nan(np.mean, x, **kwargs),
+    "median": lambda x, **kwargs: try_except_nan(np.median, x, **kwargs),
+    "geomean": lambda x, **kwargs: try_except_nan(gmean, x, **kwargs),
+    "harmean": lambda x, **kwargs: try_except_nan(hmean, x, **kwargs),
+    "mad": lambda x, **kwargs: try_except_nan(mad, x, **kwargs),
+    "trimean": lambda x, **kwargs: try_except_nan(trimean, x, **kwargs),
+    "inf": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=np.inf, **kwargs),
+    "manhattan": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=1, **kwargs),
 }
 
 
 MA_ARRAY_AGGREGATORS = {
-    "sum": lambda x, **kwargs: np.ma.sum(x, **kwargs),
-    "avg": lambda x, **kwargs: np.ma.mean(x, **kwargs),
-    "max": lambda x, **kwargs: np.ma.max(x, **kwargs),
-    "min": lambda x, **kwargs: np.ma.min(x, **kwargs),
-    "std": lambda x, **kwargs: np.ma.std(x, **kwargs),
-    "range": lambda x, **kwargs: np.ma.max(x, **kwargs) - np.ma.min(x, **kwargs),
-    "mean": lambda x, **kwargs: np.ma.mean(x, **kwargs),
-    "median": lambda x, **kwargs: np.ma.median(x, **kwargs),
-    "geomean": lambda x, **kwargs: mstast_gmean(x, **kwargs),
-    "harmean": lambda x, **kwargs: mstast_hmean(x, **kwargs),
-    "mad": lambda x, **kwargs: masked_mad(x, **kwargs),
-    "trimean": lambda x, **kwargs: masked_trimean(x, **kwargs),
-    "inf": lambda x, **kwargs: np.linalg.norm(x, ord=np.inf, **kwargs),
-    "manhattan": lambda x, **kwargs: np.linalg.norm(x, ord=1, **kwargs),
+    "sum": lambda x, **kwargs: try_except_nan(np.ma.sum, x, **kwargs),
+    "avg": lambda x, **kwargs: try_except_nan(np.ma.mean, x, **kwargs),
+    "max": lambda x, **kwargs: try_except_nan(np.ma.max, x, **kwargs),
+    "min": lambda x, **kwargs: try_except_nan(np.ma.min, x, **kwargs),
+    "std": lambda x, **kwargs: try_except_nan(np.ma.std, x, **kwargs),
+    "range": lambda x, **kwargs: try_except_nan(np.ma.max, x, **kwargs)
+    - try_except_nan(np.ma.min, x, **kwargs),
+    "mean": lambda x, **kwargs: try_except_nan(np.ma.mean, x, **kwargs),
+    "median": lambda x, **kwargs: try_except_nan(np.ma.median, x, **kwargs),
+    "geomean": lambda x, **kwargs: try_except_nan(mstast_gmean, x, **kwargs),
+    "harmean": lambda x, **kwargs: try_except_nan(mstast_hmean, x, **kwargs),
+    "mad": lambda x, **kwargs: try_except_nan(masked_mad, x, **kwargs),
+    "trimean": lambda x, **kwargs: try_except_nan(masked_trimean, x, **kwargs),
+    "inf": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=np.inf, **kwargs),
+    "manhattan": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=1, **kwargs),
 }
diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py
index e9b6400b..d1bae93e 100644
--- a/src/mofdscribe/featurizers/utils/definitions.py
+++ b/src/mofdscribe/featurizers/utils/definitions.py
@@ -30,7 +30,6 @@
         "Zr",
         "Nb",
         "Mo",
-
         "Tc",
         "Ru",
         "Rh",
diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py
index 939b7125..218ff977 100644
--- a/tests/featurizers/chemistry/test_racs.py
+++ b/tests/featurizers/chemistry/test_racs.py
@@ -48,7 +48,9 @@ def test_racs(hkust_structure, irmof_structure):
                 assert np.isnan(np.array(list(v.values()))).sum() == len(v)
         racs_ordered = OrderedDict(sorted(racs.items()))
 
-        assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels()))
+        assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(
+            map(lambda x: x.lower(), featurizer.feature_labels())
+        )
 
     # assert len(featurizer.feature_labels()) == 120
     assert len(featurizer.citations()) == 2
@@ -85,7 +87,9 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure):
             # we classify the "O" as a functional group
             assert np.isnan(np.array(list(v.values()))).sum() == 0
         racs_ordered = OrderedDict(sorted(racs.items()))
-        assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels()))
+        assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(
+            map(lambda x: x.lower(), featurizer.feature_labels())
+        )
 
     # assert len(featurizer.feature_labels()) == 120
     assert len(featurizer.citations()) == 2