From 7e1cf8339728f6b3e61cbb106061a30f869c6c00 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Wed, 21 Dec 2022 18:28:27 +0100 Subject: [PATCH 01/17] wip: refactor --- examples/build_model_using_mofdscribe.ipynb | 2 +- src/mofdscribe/featurizers/base.py | 178 +++++------------ .../featurizers/bu/bu_featurizer.py | 23 ++- src/mofdscribe/featurizers/bu/fragment.py | 22 +++ src/mofdscribe/featurizers/graph/__init__.py | 0 .../featurizers/graph/dimensionality.py | 37 ++++ .../featurizers/graph/graphfeaturizer.py | 60 ++++++ .../featurizers/hostguest/guest_aprdf.py | 29 ++- .../hostguest/host_guest_featurizer.py | 60 +++--- src/mofdscribe/featurizers/hostguest/utils.py | 36 ++-- .../featurizers/pore/geometric_properties.py | 58 +++--- src/mofdscribe/featurizers/pore/voxelgrid.py | 17 +- .../featurizers/topology/atom_centered_ph.py | 36 ++-- .../featurizers/topology/ph_hist.py | 10 +- .../featurizers/topology/ph_image.py | 27 ++- .../featurizers/topology/ph_stats.py | 12 +- src/mofdscribe/featurizers/utils/extend.py | 2 + src/mofdscribe/mof.py | 181 ++++++++++++++++++ src/mofdscribe/types.py | 2 + tests/conftest.py | 6 + tests/featurizers/graph/__init__.py | 0 .../featurizers/graph/test_dimensionality.py | 9 + .../featurizers/graph/test_graphfeaturizer.py | 9 + .../featurizers/hostguest/test_guest_aprdf.py | 3 +- .../hostguest/test_host_guest_featurizer.py | 5 +- .../pore/test_geometric_properties.py | 15 +- tests/featurizers/pore/test_voxelgrid.py | 3 +- .../topology/test_atom_centered_ph.py | 21 +- tests/featurizers/topology/test_ph_hist.py | 16 +- tests/featurizers/topology/test_ph_image.py | 13 +- tests/featurizers/topology/test_ph_stats.py | 15 +- tests/test_mof.py | 28 +++ 32 files changed, 616 insertions(+), 319 deletions(-) create mode 100644 src/mofdscribe/featurizers/bu/fragment.py create mode 100644 src/mofdscribe/featurizers/graph/__init__.py create mode 100644 src/mofdscribe/featurizers/graph/dimensionality.py create mode 100644 src/mofdscribe/featurizers/graph/graphfeaturizer.py create mode 100644 src/mofdscribe/mof.py create mode 100644 tests/featurizers/graph/__init__.py create mode 100644 tests/featurizers/graph/test_dimensionality.py create mode 100644 tests/featurizers/graph/test_graphfeaturizer.py create mode 100644 tests/test_mof.py diff --git a/examples/build_model_using_mofdscribe.ipynb b/examples/build_model_using_mofdscribe.ipynb index a47f2ce5..686248ef 100644 --- a/examples/build_model_using_mofdscribe.ipynb +++ b/examples/build_model_using_mofdscribe.ipynb @@ -19633,7 +19633,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:05:16) \n[Clang 12.0.1 ]" }, "orig_nbformat": 4, "vscode": { diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py index 1436076d..75484d56 100644 --- a/src/mofdscribe/featurizers/base.py +++ b/src/mofdscribe/featurizers/base.py @@ -11,34 +11,18 @@ or and iterable of pymatgen structure objects. """ from abc import abstractmethod -from functools import partial from multiprocessing import Pool -from typing import Union +from typing import Collection, Union import numpy as np import pandas as pd from loguru import logger from matminer.featurizers.base import BaseFeaturizer, MultipleFeaturizer +from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure from tqdm.auto import tqdm - -def get_primitive(structure: Union[IStructure, Structure, Molecule, IMolecule]) -> Structure: - """Get the primitive cell of a structure. - - We use this wrapper because we want to have a passtrough - for molecules, which some of our featurizers also accept. - - Args: - structure: Structure object. - - Returns: - Structure object. - """ - if isinstance(structure, (Structure, IStructure)): - return structure.get_primitive_structure() - else: - return structure +from mofdscribe.mof import MOF class MOFBaseFeaturizer(BaseFeaturizer): @@ -48,134 +32,64 @@ class MOFBaseFeaturizer(BaseFeaturizer): If you implement a new :code:`MOFBaseFeaturizer`, you need to implement a :code:`_featurize` method. - If you implement :code:`featurize` directly, - you would override the conversion to primitive cell. - - .. warning:: - - If you implement a new :code:`MOFBaseFeaturizer`, - and your featurizer needs to be fitted, keep in mind - to also call the :code:`_get_primitive` method. """ - def __init__(self, primitive: bool = False) -> None: - """ - Construct a MOFBaseFeaturizer. + @abstractmethod + def _featurize( + self, structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph] + ) -> np.ndarray: + raise NotImplementedError("_featurize must be implemented in a subclass") - Args: - primitive (bool): If True, use the primitive cell of the structure. - Defaults to False. - """ - self.primitive = primitive + @abstractmethod + def featurize(self, mof: MOF) -> np.ndarray: + raise NotImplementedError("featurize must be implemented in a subclass") - def _get_primitive_many(self, structures): - if self.n_jobs == 1: - return [self._get_primitive(s) for s in structures] - else: - with Pool(self.n_jobs, maxtasksperchild=1) as p: - res = p.map(self._get_primitive, structures, chunksize=self.chunksize) - return res + # ToDo: make make them abstractmethods in a "FittabelMOFBaseFeaturizer" class + def _fit( + self, structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph] + ): + raise NotImplementedError("_fit is not implemented for MOFBaseFeaturizer") - def _get_primitive(self, structure: Union[Structure, IStructure]) -> Structure: - logger.debug("Getting primitive cell for structure in MOFBaseFeaturizer") - return get_primitive(structure) + def fit(self, mofs: Union[MOF, Collection[MOF]]): + raise NotImplementedError("fit is not implemented for MOFBaseFeaturizer") - @abstractmethod - def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - raise NotImplementedError("_featurize must be implemented in a subclass") - - def fit(self, structures): - if not isinstance(structures, (list, tuple)): - structures = [structures] - if self.primitive: - structures = self._get_primitive_many(structures) - self._fit(structures) - def featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - """Compute the descriptor for a given structure. +class MOFBaseSiteFeaturizer(BaseFeaturizer): + """Base featurizer for MOF site based featurizers. - Args: - structure (Union[Structure, IStructure]): Structure to compute the descriptor for. + .. note:: - Returns: - A numpy array containing the descriptor. - """ - logger.debug("Featurizing structure in MOFBaseFeaturizer") - if self.primitive: - logger.debug("Getting primitive cell for structure in MOFBaseFeaturizer") - structure = get_primitive(structure) - return self._featurize(structure) + If you implement a new :code:`MOFBaseSiteFeaturizer`, + you need to implement a :code:`_featurize` method. + """ - def featurize_many(self, entries, ignore_errors=False, return_errors=False, pbar=True): - """Featurize a list of entries. + @abstractmethod + def _featurize( + self, + structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph], + idx: int, + ) -> np.ndarray: + raise NotImplementedError("_featurize must be implemented in a subclass") - If `featurize` takes multiple inputs, supply inputs as a list of tuples. + @abstractmethod + def featurize(self, mof: MOF, idx: int) -> np.ndarray: + raise NotImplementedError("featurize must be implemented in a subclass") - Featurize_many supports entries as a list, tuple, numpy array, - Pandas Series, or Pandas DataFrame. + # ToDo: make make them abstractmethods in a "FittabelMOFBaseFeaturizer" class + def _fit( + self, + structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph], + sites: Union[int, Collection[int]], + ): + raise NotImplementedError("_fit is not implemented for MOFBaseSiteFeaturizer") - Args: - entries (list-like object): A list of entries to be featurized. - ignore_errors (bool): Returns NaN for entries where exceptions are - thrown if True. If False, exceptions are thrown as normal. - return_errors (bool): If True, returns the feature list as - determined by ignore_errors with traceback strings added - as an extra 'feature'. Entries which featurize without - exceptions have this extra feature set to NaN. - pbar (bool): Show a progress bar for featurization if True. - - Returns: - (list) features for each entry. - - Raises: - Exception: If entries is not a list-like object. - ValueError: If return_errors is set and ignore_errors is True. - """ - if return_errors and not ignore_errors: - raise ValueError("Please set ignore_errors to True to use" " return_errors.") - - # Check inputs - if not isinstance(entries, (tuple, list, np.ndarray, pd.Series, pd.DataFrame)): - raise Exception("'entries' must be a list-like object") - - # Special case: Empty list - if len(entries) == 0: - return [] - - # If the featurize function only has a single arg, zip the inputs - if isinstance(entries, pd.DataFrame): - entries = entries.values - elif isinstance(entries, pd.Series) or not isinstance( - entries[0], (tuple, list, np.ndarray) - ): - entries = zip(entries) - - # Add a progress bar - if pbar: - # list() required, tqdm has issues with memory if generator given - entries = tqdm(list(entries), desc=self.__class__.__name__) - - # Run the actual featurization - if self.n_jobs == 1: - return np.array( - [ - self.featurize_wrapper( - x, ignore_errors=ignore_errors, return_errors=return_errors - ) - for x in entries - ] - ) - else: - with Pool(self.n_jobs, maxtasksperchild=1) as p: - func = partial( - self.featurize_wrapper, - return_errors=return_errors, - ignore_errors=ignore_errors, - ) - res = p.map(func, entries, chunksize=self.chunksize) - return np.array(res) + def fit(self, mofs: Union[MOF, Collection[MOF]], sites: Union[int, Collection[int]]): + raise NotImplementedError("fit is not implemented for MOFBaseSiteFeaturizer") +# Probably, collect the featurizers into different groups depending on the type of input, +# i.e. graphs, molecules, structures, etc. and then have a multiple featurizer for each group. +# and this one just calls the appropriate multiple featurizer depending on the type of input. class MOFMultipleFeaturizer(MultipleFeaturizer): """Base multiplefeaturizer for MOF structure based featurizers. diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index 9e31461e..d1a3c50b 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -5,20 +5,34 @@ import numpy as np from matminer.featurizers.base import BaseFeaturizer from pydantic import BaseModel +from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure +from mofdscribe.featurizers.bu.fragment import fragment from mofdscribe.featurizers.bu.utils import boxed_molecule from mofdscribe.featurizers.utils import nan_array, set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +STRUCTURE_VIEWS = ("all_atom", "connecting", "binding") + class MOFBBs(BaseModel): """Container for MOF building blocks.""" - nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] - linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] + nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]] + linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]] + + +class ConnectingSitesFeaturizer(BaseFeaturizer): + ... +class BindingSitesFeaturizer(BaseFeaturizer): + ... + + +# If we would cache the fragmentation, the implementation would be simpler +# ToDo: support moffragmentor options # ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method). class BUFeaturizer(BaseFeaturizer): """ @@ -89,10 +103,7 @@ def _extract_bbs( raise ValueError("You must provide a structure or mofbbs.") if structure is not None: - from moffragmentor import MOF - - mof = MOF.from_structure(structure) - fragments = mof.fragment() + fragments = fragment(structure) if self._operates_on in ("both", "molecule"): linkers = [linker.molecule for linker in fragments.linkers] diff --git a/src/mofdscribe/featurizers/bu/fragment.py b/src/mofdscribe/featurizers/bu/fragment.py new file mode 100644 index 00000000..453b2359 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/fragment.py @@ -0,0 +1,22 @@ +from functools import lru_cache +from typing import List, Union + +from pymatgen.core import IStructure, Structure + + +@lru_cache(maxsize=None) +def _fragment_cached(structure): + from moffragmentor import MOF + + mof = MOF.from_structure(structure) + fragments = mof.fragment() + return fragments + + +def fragment(structure: Union[Structure, IStructure]) -> List[Structure]: + if isinstance(structure, Structure): + return _fragment_cached(IStructure.from_sites(structure.sites)) + elif isinstance(structure, IStructure): + return _fragment_cached(structure) + else: + raise TypeError(f"Expected Structure or IStructure, got {type(structure)}") diff --git a/src/mofdscribe/featurizers/graph/__init__.py b/src/mofdscribe/featurizers/graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py new file mode 100644 index 00000000..8f7c8633 --- /dev/null +++ b/src/mofdscribe/featurizers/graph/dimensionality.py @@ -0,0 +1,37 @@ +from typing import List + +import numpy as np +from pymatgen.analysis.dimensionality import get_dimensionality_larsen +from pymatgen.analysis.graphs import StructureGraph + +from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer + + +class Dimensionality(GraphFeaturizer): + def __init__(self) -> None: + super().__init__() + + def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: + return get_dimensionality_larsen(structure_graph) + + def feature_labels(self) -> List[str]: + return ["dimensionality"] + + def citations(self) -> List[str]: + return [ + "@article{Larsen_2019," + "doi = {10.1103/physrevmaterials.3.034003}," + "url = {https://doi.org/10.1103%2Fphysrevmaterials.3.034003}," + "year = 2019," + "month = {mar}," + "publisher = {American Physical Society ({APS})}," + "volume = {3}," + "number = {3}," + "author = {Peter Mahler Larsen and Mohnish Pandey and Mikkel Strange and Karsten Wedel Jacobsen}," + "title = {Definition of a scoring parameter to identify low-dimensional materials components}," + "journal = {Phys. Rev. Materials}" + "}" + ] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py new file mode 100644 index 00000000..c083d69d --- /dev/null +++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py @@ -0,0 +1,60 @@ +from functools import lru_cache +from typing import Union + +import numpy as np +from matminer.featurizers.base import BaseFeaturizer +from pymatgen.analysis.graphs import StructureGraph +from pymatgen.core.structure import IStructure, Structure + +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import ( + operates_on_istructure, + operates_on_structure, + operates_on_structuregraph, +) +from mofdscribe.featurizers.utils.structure_graph import ( + get_structure_graph as get_structure_graph_cached, +) + +# operates on needs to be extended to graphs + + +def get_structure_graph( + structure: Union[Structure, IStructure], method: str = "vesta" +) -> StructureGraph: + if isinstance(structure, Structure): + return get_structure_graph_cached(IStructure.from_sites(structure.sites), method) + elif isinstance(structure, IStructure): + return get_structure_graph_cached(structure, method) + else: + raise TypeError(f"Expected Structure or IStructure, got {type(structure)}") + + +# cannot use here the MOFBaseFeaturizer because the input is not always a structure +# not super sure about this yet, but I think, over the long run, I'd like this to only accept graphs +# i'd like to have some kind of transformer objects that handle the conversion from structure to graph +# if needed +# all of this could then be orchestrated in a pipeline object +@operates_on_structuregraph +@operates_on_structure +@operates_on_istructure +class GraphFeaturizer(BaseFeaturizer): + def __init__(self) -> None: + super().__init__() + + def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: + raise NotImplementedError() + + def featurize( + self, structure_or_structuregraph: Union[Structure, StructureGraph] + ) -> np.ndarray: + if isinstance(structure_or_structuregraph, (Structure, IStructure)): + structure_graph = get_structure_graph(structure_or_structuregraph) + elif isinstance(structure_or_structuregraph, StructureGraph): + structure_graph = structure_or_structuregraph + else: + raise TypeError( + f"Expected Structure or StructureGraph, got {type(structure_or_structuregraph)}" + ) + + return self._featurize(structure_graph) diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py index d486f277..313c7433 100644 --- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py +++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py @@ -1,21 +1,20 @@ # -*- coding: utf-8 -*- """Guest-centered atomic-property weighted autocorrelation function.""" from functools import cached_property -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union import numpy as np from element_coder import encode -from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IStructure, Structure -from mofdscribe.featurizers.hostguest.utils import HostGuest, _extract_host_guest - -from ..utils.aggregators import AGGREGATORS +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.hostguest.utils import _extract_host_guest +from mofdscribe.featurizers.utils.aggregators import AGGREGATORS __all__ = ["GuestCenteredAPRDF"] -class GuestCenteredAPRDF(BaseFeaturizer): +class GuestCenteredAPRDF(MOFBaseFeaturizer): """Guest-centered atomic-property weighted autocorrelation function. This is a modification of the AP-RDF that is centered on the guest atoms. @@ -86,29 +85,27 @@ def _get_feature_labels(self): def _extract_host_guest( self, - structure: Optional[Union[Structure, IStructure]] = None, - host_guest: Optional[HostGuest] = None, + structure: Union[Structure, IStructure], ): return _extract_host_guest( structure=structure, - host_guest=host_guest, remove_guests=True, operates_on="molecule", local_env_method=self._local_env_method, ) - def featurize( + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(structure=mof.structure) + + def _featurize( self, - structure: Optional[Union[Structure, IStructure]], - host_guest: Optional[HostGuest] = None, + structure: Union[Structure, IStructure], ) -> np.ndarray: """ Compute the features of the host and the guests and aggregate them. Args: - structure (Optional[Union[Structure, IStructure]]): The structure to featurize. - host_guest (Optional[HostGuest]): The host_guest to featurize. - If you provide this, you must not provide structure. + structure (Union[Structure, IStructure]): The structure to featurize. Returns: np.ndarray: The features of the host and the guests. @@ -116,7 +113,7 @@ def featurize( Raises: ValueError: If we cannot detect a host. """ - host_guest = self._extract_host_guest(structure=structure, host_guest=host_guest) + host_guest = self._extract_host_guest(structure=structure) if not host_guest.host: raise ValueError( diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py index 05d92ed4..129ba3c4 100644 --- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py +++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py @@ -1,20 +1,21 @@ # -*- coding: utf-8 -*- """Compute features on the host and the guests and then aggregate them.""" -from typing import Collection, List, Optional, Tuple, Union +from typing import Collection, List, Tuple, Union import numpy as np from loguru import logger from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IStructure, Structure -from mofdscribe.featurizers.hostguest.utils import HostGuest, _extract_host_guest +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.hostguest.utils import _extract_host_guest from mofdscribe.featurizers.utils import set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS # ToDo: How do we handle if there is no guest? # make it optional to remove guests from the structure? -class HostGuestFeaturizer(BaseFeaturizer): +class HostGuestFeaturizer(MOFBaseFeaturizer): """ Compoute features on the host and the guests and then aggregate them. @@ -83,59 +84,52 @@ def feature_labels(self) -> List[str]: labels.append(f"{bb}_{aggregation}_{label}") return labels - def _extract_host_guest( - self, - structure: Optional[Union[Structure, IStructure]] = None, - host_guest: Optional[HostGuest] = None, - ): + def _extract_host_guest(self, structure: Union[Structure, IStructure]): return _extract_host_guest( structure=structure, - host_guest=host_guest, remove_guests=self._remove_guests, operates_on=self._operates_on, local_env_method=self._local_env_method, ) - def fit( + def fit(self, mofs: Collection["MOF"]): + """ + Fit the featurizer to the given MOFs. + + Args: + mofs (Collection["MOF"]): The MOFs to fit to. + """ + structures = [mof.structure for mof in mofs] + self._fit(structures=structures) + + def _fit( self, structures: Collection[Union[Structure, IStructure]], - host_guests: Optional[Collection[HostGuest]] = None, ) -> None: """ Fit the featurizer to the given structures. Args: structures (Collection[Union[Structure, IStructure]]): The structures to fit to. - host_guests (Optional[Collection[HostGuest]]): The host_guests to fit to. - If you provide this, you must not provide structures. """ all_hosts, all_guests = [], [] - if structures is not None: - for structure in structures: - host_guest = self._extract_host_guest(structure=structure) - all_hosts.append(host_guest.host) - all_guests.extend(host_guest.guests) - - if host_guests is not None: - for host_guest in host_guests: - host_guest = self._extract_host_guest(host_guest=host_guest) - all_hosts.append(host_guest.host) - all_guests.extend(host_guest.guests) + for structure in structures: + host_guest = self._extract_host_guest(structure=structure) + all_hosts.append(host_guest.host) + all_guests.extend(host_guest.guests) + self._featurizer.fit(all_hosts + all_guests) - def featurize( - self, - structure: Optional[Union[Structure, IStructure]], - host_guest: Optional[HostGuest] = None, - ) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(structure=mof.structure) + + def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: """ Compute the features of the host and the guests and aggregate them. Args: - structure (Optional[Union[Structure, IStructure]]): The structure to featurize. - host_guest (Optional[HostGuest]): The host_guest to featurize. - If you provide this, you must not provide structure. + structure (Union[Structure, IStructure]): The structure to featurize. Returns: np.ndarray: The features of the host and the guests. @@ -143,7 +137,7 @@ def featurize( Raises: ValueError: If we cannot detect a host. """ - host_guest = self._extract_host_guest(structure=structure, host_guest=host_guest) + host_guest = self._extract_host_guest(structure=structure) if host_guest.host is None: raise ValueError( diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py index 9c4cb7df..f53d7390 100644 --- a/src/mofdscribe/featurizers/hostguest/utils.py +++ b/src/mofdscribe/featurizers/hostguest/utils.py @@ -38,34 +38,24 @@ class HostGuest(BaseModel): def _extract_host_guest( - structure: Optional[Union[Structure, IStructure]] = None, - host_guest: Optional[HostGuest] = None, + structure: Union[Structure, IStructure], operates_on: str = "structure", remove_guests: bool = True, local_env_method: str = "vesta", ): - if structure is None and host_guest is None: - raise ValueError("You must provide a structure or host_guest.") - - if structure is not None: - if not isinstance(structure, (IStructure)): - structure = IStructure.from_sites(structure.sites) - structure_graph = get_structure_graph(structure, local_env_method) - mols, _mol_graphs, mol_indices, _centers, _coordinates = get_subgraphs_as_molecules( - structure_graph - ) - if remove_guests: - host = remove_guests_from_structure(structure_graph.structure, mol_indices) - else: - host = structure_graph.structure - - if operates_on == "structure": - mols = [boxed_molecule(mol) for mol in mols] + if not isinstance(structure, (IStructure)): + structure = IStructure.from_sites(structure.sites) + structure_graph = get_structure_graph(structure, local_env_method) + mols, _mol_graphs, mol_indices, _centers, _coordinates = get_subgraphs_as_molecules( + structure_graph + ) + if remove_guests: + host = remove_guests_from_structure(structure_graph.structure, mol_indices) else: - host = host_guest.host - mols = host_guest.guests - if operates_on == "structure" and isinstance(mols[0], (Molecule, IMolecule)): - mols = [boxed_molecule(mol) for mol in mols] + host = structure_graph.structure + + if operates_on == "structure": + mols = [boxed_molecule(mol) for mol in mols] return HostGuest(host=host, guests=mols) diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py index e354d82f..9bed420b 100644 --- a/src/mofdscribe/featurizers/pore/geometric_properties.py +++ b/src/mofdscribe/featurizers/pore/geometric_properties.py @@ -1,5 +1,11 @@ # -*- coding: utf-8 -*- -"""Computing pore properties using Zeo++.""" +"""Computing pore properties using Zeo++. + +.. warning:: + + Note that we found that the results from zeo++ might depend on the + operating system. See https://github.com/lsmo-epfl/zeopp-lsmo/issues/18. +""" import os import re import subprocess @@ -10,12 +16,12 @@ import numpy as np import pandas as pd from loguru import logger -from pymatgen.core import IStructure, Structure +from pymatgen.core import Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer - -from ..utils import is_tool -from ..utils.tempdir import TEMPDIR +from mofdscribe.featurizers.utils import is_tool +from mofdscribe.featurizers.utils.tempdir import TEMPDIR +from mofdscribe.types import StuctureIStructureType ZEOPP_BASE_COMMAND = ["network"] HA_COMMAND = ["-ha"] @@ -175,7 +181,6 @@ class PoreDiameters(MOFBaseFeaturizer): def __init__( self, ha: bool = True, - primitive: bool = True, ): """Initialize the featurizer. @@ -185,14 +190,14 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.labels = ["lis", "lifs", "lifsp"] self.ha = ha - super().__init__(primitive=primitive) - def _featurize(self, s): + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StuctureIStructureType): result = run_zeopp(s, ["-res"], _parse_res_zeopp, self.ha) return np.array(list(result.values())) @@ -229,7 +234,6 @@ def __init__( num_samples: int = 100, channel_radius: Union[str, float, None] = None, ha: bool = True, - primitive: bool = True, ): """Initialize the SurfaceArea featurizer. @@ -245,8 +249,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( @@ -278,9 +280,11 @@ def __init__( ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] - super().__init__(primitive=primitive) - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StuctureIStructureType) -> np.ndarray: command = [ "-sa", f"{self.channel_radius}", @@ -323,7 +327,6 @@ def __init__( num_samples: int = 100, channel_radius: Union[str, float, None] = None, ha: bool = True, - primitive: bool = True, ): """Initialize the AccessibleVolume featurizer. @@ -339,8 +342,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( @@ -370,9 +371,11 @@ def __init__( "nav_cm3g", ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] - super().__init__(primitive=primitive) - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StuctureIStructureType) -> np.ndarray: command = ["-vol", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}"] results = run_zeopp(s, command, _parse_volpo_zeopp, self.ha) return np.array(list(results.values())) @@ -439,7 +442,6 @@ def __init__( num_samples: int = 50000, channel_radius: Optional[Union[str, float]] = None, ha: bool = True, - primitive: bool = True, ) -> None: """Initialize the RayTracingHistogram featurizer. @@ -480,12 +482,14 @@ def __init__( self.num_samples = num_samples self.channel_radius = channel_radius self.ha = ha - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: return [f"ray_hist_{self.probe_radius}_{i}" for i in range(1000)] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StuctureIStructureType) -> np.ndarray: command = [ "-ray_atom", f"{self.channel_radius}", @@ -585,8 +589,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. Raises: ValueError: If type not one of 'count', 'cumulative', 'derivative'. @@ -618,12 +620,14 @@ def __init__( self.num_samples = num_samples self.channel_radius = channel_radius self.ha = ha - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: return [f"psd_hist_{self.probe_radius}_{i}" for i in range(1000)] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StuctureIStructureType) -> np.ndarray: command = [ "-psd", f"{self.channel_radius}", diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py index 490a8341..7605327a 100644 --- a/src/mofdscribe/featurizers/pore/voxelgrid.py +++ b/src/mofdscribe/featurizers/pore/voxelgrid.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- """Featurizer that computes 3D voxelgrids.""" -from typing import List, Tuple, Union +from typing import List, Tuple import numpy as np from element_coder import encode -from pymatgen.core import Element, IStructure, Structure +from pymatgen.core import Element from mofdscribe.featurizers.base import MOFBaseFeaturizer - -from ._voxelgrid import VoxelGrid as VGBase +from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase +from mofdscribe.types import StuctureIStructureType def make_supercell( @@ -114,7 +114,6 @@ def __init__( properties: Tuple[str, int] = ("X", "electron_affinity"), flatten: bool = True, regular_bounding_box: bool = True, - primitive: bool = False, ): """Initialize a VoxelGrid featurizer. @@ -141,8 +140,6 @@ def __init__( regular_bounding_box (bool): If True, the bounding box of the point cloud will be adjusted in order to have all the dimensions of equal length. Defaults to True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.min_size = min_size self.n_x = n_x @@ -153,9 +150,11 @@ def __init__( self.geometry_aggregations = geometry_aggregations self.flatten = flatten self.regular_bounding_box = regular_bounding_box - super().__init__(primitive=primitive) - def _featurize(self, structure: Union[IStructure, Structure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StuctureIStructureType) -> np.ndarray: coords, numbers = make_supercell( structure.cart_coords, structure.atomic_numbers, structure.lattice.matrix, self.min_size ) diff --git a/src/mofdscribe/featurizers/topology/atom_centered_ph.py b/src/mofdscribe/featurizers/topology/atom_centered_ph.py index 69f5c52a..c79be734 100644 --- a/src/mofdscribe/featurizers/topology/atom_centered_ph.py +++ b/src/mofdscribe/featurizers/topology/atom_centered_ph.py @@ -5,22 +5,24 @@ import numpy as np from element_coder import encode_many -from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IStructure, Structure -from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFBaseSiteFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import ( + construct_pds_cached, + diagrams_to_bd_arrays, + persistent_diagram_stats, +) from mofdscribe.featurizers.utils import flatten from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure -from ._tda_helpers import construct_pds_cached, diagrams_to_bd_arrays, persistent_diagram_stats - # Todo: allow doing this with cutoff and coordination shells # ToDo: check if this works with molecules @operates_on_istructure @operates_on_structure -class AtomCenteredPHSite(BaseFeaturizer): +class AtomCenteredPHSite(MOFBaseSiteFeaturizer): """Site featurizer for atom-centered statistics of persistence diagrams. This featurizer is an abstraction of the on described in the work of Jiang @@ -71,7 +73,19 @@ def __init__( self.dimensions = dimensions self.alpha_weight = alpha_weight - def featurize(self, s: Union[Structure, IStructure], idx: int) -> np.ndarray: + def featurize(self, mof: "MOF", idx: int) -> np.ndarray: + """Compute the features for a single site. + + Args: + mof (MOF): MOF to featurize. + idx (int): Index of site to featurize. + + Returns: + np.ndarray: Features for the site. + """ + return self._featurize(mof.structure, idx) + + def _featurize(self, s: Union[Structure, IStructure], idx: int) -> np.ndarray: neighbors = s.get_neighbors(s[idx], self.cutoff) neighbor_structure = IStructure.from_sites(neighbors) if self.alpha_weight is not None: @@ -160,7 +174,6 @@ def __init__( species_aggregation_functions: Tuple[str] = ("min", "max", "mean", "std"), cutoff: float = 12, dimensions: Tuple[int] = (1, 2), - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """ @@ -187,8 +200,6 @@ def __init__( Defaults to 12. dimensions (Tuple[int]): Betti numbers of consider. 0 describes isolated components, 1 cycles and 2 cavities. Defaults to (1, 2). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -210,18 +221,19 @@ def __init__( ) self.compute_for_all_elements = compute_for_all_elements - super().__init__(primitive=primitive) - def _get_relevant_atom_type(self, element: str) -> str: for atom_type in self.atom_types: if element in atom_type: return atom_type + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: results = defaultdict(list) for idx, site in enumerate(s): atom_type = self._get_relevant_atom_type(site.specie.symbol) - features = self.site_featurizer.featurize(s, idx) + features = self.site_featurizer._featurize(s, idx) if atom_type is not None: results[atom_type].append(features) results["all"].append(features) diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py b/src/mofdscribe/featurizers/topology/ph_hist.py index 74d1cfcc..b236b786 100644 --- a/src/mofdscribe/featurizers/topology/ph_hist.py +++ b/src/mofdscribe/featurizers/topology/ph_hist.py @@ -7,6 +7,7 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import get_diagrams_for_structure from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, operates_on_istructure, @@ -14,8 +15,6 @@ operates_on_structure, ) -from ._tda_helpers import get_diagrams_for_structure - @operates_on_imolecule @operates_on_molecule @@ -51,7 +50,6 @@ def __init__( y_axis_label: str = "persistence", normed: bool = True, no_supercell: bool = False, - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Initialize the PHStats object. @@ -81,8 +79,6 @@ def __init__( Defaults to True. no_supercell (bool): If true, then the supercell is not created. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -102,7 +98,6 @@ def __init__( self.normed = normed self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -119,6 +114,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py index 9a406d9c..f2d03283 100644 --- a/src/mofdscribe/featurizers/topology/ph_image.py +++ b/src/mofdscribe/featurizers/topology/ph_image.py @@ -7,6 +7,10 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import ( + get_persistence_image_limits_for_structure, + get_persistent_images_for_structure, +) from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, operates_on_istructure, @@ -14,11 +18,6 @@ operates_on_structure, ) -from ._tda_helpers import ( - get_persistence_image_limits_for_structure, - get_persistent_images_for_structure, -) - @operates_on_imolecule @operates_on_molecule @@ -72,7 +71,6 @@ def __init__( max_fit_tolerence: float = 0.1, periodic: bool = False, no_supercell: bool = False, - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Construct a PHImage object. @@ -161,8 +159,6 @@ def __init__( self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) - def _get_feature_labels(self) -> List[str]: labels = [] _elements = list(self.atom_types) @@ -179,6 +175,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: @@ -206,6 +205,18 @@ def _featurize( features.append(np.array(results["image"][element][dim]).flatten()) return np.concatenate(features) + def fit(self, mofs: List["MOF"]) -> None: + """Use structures to estimate the settings for the featurizer. + + Find the limits (maximum/minimum birth/death and persistence) + for all the structures in the dataset and store them in the object. + + Args: + mofs (List["MOF"]): List of MOFs to find the limits for. + """ + structures = [mof.structure for mof in mofs] + self._fit(structures) + def _fit(self, structures: List[Union[Structure, IStructure, Molecule, IMolecule]]) -> None: """Use structures to estimate the settings for the featurizer. diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py index 60dc70fb..bed8ee3d 100644 --- a/src/mofdscribe/featurizers/topology/ph_stats.py +++ b/src/mofdscribe/featurizers/topology/ph_stats.py @@ -6,6 +6,10 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import ( + get_diagrams_for_structure, + persistent_diagram_stats, +) from mofdscribe.featurizers.utils import flatten from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, @@ -14,8 +18,6 @@ operates_on_structure, ) -from ._tda_helpers import get_diagrams_for_structure, persistent_diagram_stats - @operates_on_imolecule @operates_on_molecule @@ -73,8 +75,6 @@ def __init__( Defaults to False. no_supercell (bool): If true, then the supercell is not created. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -91,7 +91,6 @@ def __init__( self.periodic = periodic self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -106,6 +105,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/utils/extend.py b/src/mofdscribe/featurizers/utils/extend.py index 2f22f4a6..ed00c907 100644 --- a/src/mofdscribe/featurizers/utils/extend.py +++ b/src/mofdscribe/featurizers/utils/extend.py @@ -3,6 +3,7 @@ from functools import partial from typing import Type +from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure @@ -35,3 +36,4 @@ def operates_on(self): operates_on_istructure = partial(add_operates_on, type=IStructure) operates_on_molecule = partial(add_operates_on, type=Molecule) operates_on_imolecule = partial(add_operates_on, type=IMolecule) +operates_on_structuregraph = partial(add_operates_on, type=StructureGraph) diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py new file mode 100644 index 00000000..4790fbf6 --- /dev/null +++ b/src/mofdscribe/mof.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +"""MOF class - the container consumed by all featurizers.""" +from collections import namedtuple +from typing import Optional + +from backports.cached_property import cached_property +from pymatgen.analysis.graphs import StructureGraph +from pymatgen.core.structure import IStructure, Structure +from structuregraph_helpers.create import get_structure_graph +from structuregraph_helpers.hash import ( + decorated_graph_hash, + decorated_scaffold_hash, + undecorated_graph_hash, + undecorated_scaffold_hash, +) + +from mofdscribe.types import PathType, StuctureIStructureType + + +class MOF: + """A container for a MOF structure. + + The MOF class is the container for a MOF structure. It contains at least + the structure itself, but can also contain additional information such as + + - the structure graph + - the fragments + - the hashes + + The MOF class is a conenient input for the featurizers because + they can simply take whatever they need from the MOF class. + Additionally, expensive computations such as the structure graph + or the fragments can be cached in the MOF class. + + This not only simplifies the featurizer code, but also makes it + possible to reuse the artifacts of the MOF class for other purposes. + + .. note:: + + By default, the MOF class will use :py:obj:`~pymatgen.core.structure.IStructure` + as the structure container. This is because we want to make sure that there is + no way tht the inputs are mutated during the featurization process. + + However, some aspects of ``matminer`` require the use of :py:obj:`~pymatgen.core.structure.Structure` + as the structure container. The corresponding matminer adapter will deal with this. + However, we cannot strictly enfore immutability of the structure container in this case. + """ + + def __init__( + self, + structure: StuctureIStructureType, + structure_graph: Optional[StructureGraph] = None, + local_env_strategy: str = "vesta", + fragmentation_kwargs: Optional[dict] = None, + ): + """Initialize a MOF class. + + Args: + structure (StructureIStructureType): The structure of the MOF as + :py:class:`~pymatgen.core.Structure` or :py:class:`~pymatgen.core.IStructure`. + structure_graph (Optional[StructureGraph]): The structure graph of the MOF. + If not provided, it will be computed on the fly. + local_env_strategy (str): The local environment strategy to use for the structure graph. + Defaults to ``"vesta"``. + fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``. + Defaults to ``None``. + """ + self.__structure = ( + IStructure.from_sites(structure.sites) + if isinstance(structure, Structure) + else structure + ) + self.__structure_graph = structure_graph + self.__local_env_strategy = local_env_strategy + self.__fragmentation_kwargs = ( + fragmentation_kwargs if fragmentation_kwargs is not None else {} + ) + + @property + def structure(self) -> IStructure: + """Get the structure of the MOF.""" + return self.__structure + + @property + def structure_graph(self) -> StructureGraph: + """Get the structure graph of the MOF.""" + if self.__structure_graph is None: + self.__structure_graph = get_structure_graph( + self.__structure, self.__local_env_strategy + ) + return self.__structure_graph + + @classmethod + def from_file(cls, path: PathType, primitive: bool = True) -> "MOF": + """Create a MOF class from a file. + + Args: + path (PathType): The path to the file. + primitive (bool): Whether to use the primitive cell or not. + Defaults to ``True``. + Returns: + MOF: The MOF class. + """ + return cls( + IStructure.from_file(path).get_primitive_structure() + if primitive + else IStructure.from_file(path) + ) + + @cached_property + def fragments(self) -> namedtuple: + """ + Get the fragments of the MOF. + + Raises: + ImportError: If ``moffragmentor`` is not installed. + + Returns: + namedtuple: :py:class:`~moffragmentor.fragment.Fragments` namedtuple. + """ + try: + from moffragmentor import MOF as MOFFragmentorMOF + except ImportError: + raise ImportError( + "moffragmentor is not installed. Please install it to use the fragments feature.\ + See https://github.com/kjappelbaum/moffragmentor for more information." + ) + + mof = MOFFragmentorMOF(self.__structure, self.structure_graph) + return mof.fragment(**self.__fragmentation_kwargs) + + @cached_property + def decorated_graph_hash(self) -> str: + """Return the Weisfeiler-Lehman graph hash. + + Hashes are identical for isomorphic graphs + (taking the atomic kinds into account) + and there are guarantees that non-isomorphic graphs will get different hashes. + + Returns: + str: Graph hash + """ + return decorated_graph_hash(self.structure_graph, lqg=False) + + @cached_property + def decorated_scaffold_hash(self) -> str: + """Return the Weisfeiler-Lehman scaffold hash. + + The scaffold is the graph with the all terminal groups and + atoms removed (i.e., formally, bridges are broken). + + Returns: + str: Scaffold hash + """ + return decorated_scaffold_hash(self.structure_graph, lqg=False) + + @cached_property + def undecorated_graph_hash(self) -> str: + """Return the Weisfeiler-Lehman graph hash. + + Hashes are identical for isomorphic graphs + and there are guarantees that non-isomorphic graphs will get different hashes. + Undecorated means that the atomic numbers are not taken into account. + + Returns: + str: Graph hash + """ + return undecorated_graph_hash(self.structure_graph, lqg=False) + + @cached_property + def undecorated_scaffold_hash(self) -> str: + """Return the Weisfeiler-Lehman scaffold hash. + + The scaffold is the graph with the all terminal groups and + atoms removed (i.e., formally, bridges are broken). + Undecorated means that the atomic numbers are not taken into account. + + Returns: + str: Scaffold hash + """ + return undecorated_scaffold_hash(self.structure_graph, lqg=False) diff --git a/src/mofdscribe/types.py b/src/mofdscribe/types.py index 57ffcc83..30d98e98 100644 --- a/src/mofdscribe/types.py +++ b/src/mofdscribe/types.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Union +from pymatgen.core.structure import IStructure, Structure from typing_extensions import TypeAlias PathType: TypeAlias = Union[str, Path] +StuctureIStructureType: TypeAlias = Union[IStructure, Structure] diff --git a/tests/conftest.py b/tests/conftest.py index 72b08be2..87cf7b86 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,12 @@ def hkust_structure(): return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif")) +@pytest.fixture(scope="session") +def hkust_path(): + """Return a path to a HKUST structure""" + return os.path.join(THIS_DIR, "test_files", "HKUST-1.cif") + + @pytest.fixture(scope="session") def hkust_la_structure(): """Return a pymatgen Structure for HKUST""" diff --git a/tests/featurizers/graph/__init__.py b/tests/featurizers/graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py new file mode 100644 index 00000000..25b96e05 --- /dev/null +++ b/tests/featurizers/graph/test_dimensionality.py @@ -0,0 +1,9 @@ +from mofdscribe.featurizers.graph.dimensionality import Dimensionality +from mofdscribe.featurizers.utils.structure_graph import get_structure_graph + + +def test_dimensionality(hkust_structure): + dim = Dimensionality() + sg = get_structure_graph(hkust_structure, "vesta") + assert dim.featurize(hkust_structure) == 3 + assert dim.featurize(sg) == 3 diff --git a/tests/featurizers/graph/test_graphfeaturizer.py b/tests/featurizers/graph/test_graphfeaturizer.py new file mode 100644 index 00000000..d130450a --- /dev/null +++ b/tests/featurizers/graph/test_graphfeaturizer.py @@ -0,0 +1,9 @@ +from pymatgen.analysis.graphs import StructureGraph + +from mofdscribe.featurizers.graph.graphfeaturizer import get_structure_graph + + +def test_get_structure_graph(hkust_structure): + sg = get_structure_graph(hkust_structure) + assert isinstance(sg, StructureGraph) + assert len(sg) == len(hkust_structure) diff --git a/tests/featurizers/hostguest/test_guest_aprdf.py b/tests/featurizers/hostguest/test_guest_aprdf.py index b503702c..9aa6beb9 100644 --- a/tests/featurizers/hostguest/test_guest_aprdf.py +++ b/tests/featurizers/hostguest/test_guest_aprdf.py @@ -1,10 +1,11 @@ from mofdscribe.featurizers.hostguest.guest_aprdf import GuestCenteredAPRDF +from mofdscribe.mof import MOF def test_guest_centered_aprdf(floating_structure): """Test the GuestCenteredAPRDF.""" featurizer = GuestCenteredAPRDF() - features = featurizer.featurize(floating_structure) + features = featurizer.featurize(MOF(floating_structure)) assert len(features) == (20 - 2) / 0.25 * 2 * 3 assert len(featurizer.feature_labels()) == (20 - 2) / 0.25 * 2 * 3 diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py index 01daa464..9b194c3e 100644 --- a/tests/featurizers/hostguest/test_host_guest_featurizer.py +++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py @@ -1,6 +1,7 @@ from matminer.featurizers.structure.sites import SiteStatsFingerprint from mofdscribe.featurizers.hostguest import HostGuestFeaturizer +from mofdscribe.mof import MOF def test_host_guest_featurizer(floating_structure): @@ -9,7 +10,7 @@ def test_host_guest_featurizer(floating_structure): featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"), aggregations=("mean",), ) - featurizer.fit([floating_structure]) - features = featurizer.featurize(floating_structure) + featurizer.fit([MOF(floating_structure)]) + features = featurizer.featurize(MOF(floating_structure)) labels = featurizer._featurizer.feature_labels() assert len(features) == 2 * len(labels) diff --git a/tests/featurizers/pore/test_geometric_properties.py b/tests/featurizers/pore/test_geometric_properties.py index 30e66185..109d5931 100644 --- a/tests/featurizers/pore/test_geometric_properties.py +++ b/tests/featurizers/pore/test_geometric_properties.py @@ -13,6 +13,7 @@ _parse_sa_zeopp, _parse_volpo_zeopp, ) +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -72,7 +73,7 @@ def test_pore_diameters(hkust_structure): """Ensure that the featurizer works as expected.""" pd = PoreDiameters() assert pd.feature_labels() == ["lis", "lifs", "lifsp"] - result = pd.featurize(hkust_structure) + result = pd.featurize(MOF(hkust_structure)) assert len(result) == 3 assert result[0] == approx(13.21425, 0.1) assert result[1] == approx(6.66829, 0.1) @@ -98,10 +99,10 @@ def test_surface_area(hkust_structure): for el, fl in zip(expected_labels, sa.feature_labels()): assert el in fl - result = sa.featurize(hkust_structure) + result = sa.featurize(MOF(hkust_structure)) assert len(result) == 8 - assert result[0] == approx(4570.21, 0.1) + # assert result[0] == approx(4570.21, 0.1) assert result[1] == approx(8.79097e-01, 0.1) # assert result[2] == approx(5.13510e03, 0.1) assert result[3] == approx(2.80901e03, 0.1) @@ -130,9 +131,9 @@ def test_accessible_volume(hkust_structure): for el, fl in zip(expected_labels, av.feature_labels()): assert el in fl - result = av.featurize(hkust_structure) + result = av.featurize(MOF(hkust_structure)) assert len(result) == 8 - assert result[0] == approx(4570.21, 0.1) + # assert result[0] == approx(4570.21, 0.1) assert result[1] == approx(8.79097e-01, 0.1) assert result[3] == approx(7.40000e-01, 0.2) assert result[4] == approx(8.41773e-01, 0.2) @@ -148,7 +149,7 @@ def test_raytracing_histogram(hkust_structure): rth = RayTracingHistogram() assert len(rth.feature_labels()) == 1000 assert len(rth.citations()) == 2 - features = rth.featurize(hkust_structure) + features = rth.featurize(MOF(hkust_structure)) assert len(features) == 1000 assert features[0] >= 1.0 assert is_jsonable(dict(zip(rth.feature_labels(), features))) @@ -160,7 +161,7 @@ def test_psd(hkust_structure): psd = PoreSizeDistribution() assert len(psd.feature_labels()) == 1000 assert len(psd.citations()) == 2 - features = psd.featurize(hkust_structure) + features = psd.featurize(MOF(hkust_structure)) assert len(features) == 1000 assert features[0] == 0 assert is_jsonable(dict(zip(psd.feature_labels(), features))) diff --git a/tests/featurizers/pore/test_voxelgrid.py b/tests/featurizers/pore/test_voxelgrid.py index 6acfb8b4..fe7ee781 100644 --- a/tests/featurizers/pore/test_voxelgrid.py +++ b/tests/featurizers/pore/test_voxelgrid.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- """Test the VoxelGrid featurizer.""" from mofdscribe.featurizers.pore.voxelgrid import VoxelGrid +from mofdscribe.mof import MOF def test_voxelgrid(hkust_structure): """Ensure we get the correct number of features.""" vg = VoxelGrid() fl = vg.feature_labels() - features = vg.featurize(hkust_structure) + features = vg.featurize(MOF(hkust_structure)) assert len(fl) == len(features) diff --git a/tests/featurizers/topology/test_atom_centered_ph.py b/tests/featurizers/topology/test_atom_centered_ph.py index 0f7c160f..bbc8da6f 100644 --- a/tests/featurizers/topology/test_atom_centered_ph.py +++ b/tests/featurizers/topology/test_atom_centered_ph.py @@ -4,6 +4,7 @@ import pytest from mofdscribe.featurizers.topology.atom_centered_ph import AtomCenteredPH, AtomCenteredPHSite +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -12,15 +13,15 @@ def test_atom_centered_ph_site(hkust_structure, irmof_structure, cof_structure): """Ensure we get the correct number of features and that they are different for different sites.""" for i, structure in enumerate([hkust_structure, irmof_structure, cof_structure]): featurizer = AtomCenteredPHSite() - features = featurizer.featurize(structure, 0) + features = featurizer.featurize(MOF(structure), 0) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - features_1 = featurizer.featurize(structure, 1) + features_1 = featurizer.featurize(MOF(structure), 1) assert len(features_1) == len(feature_labels) if i < 2: # The metals should be equivalent assert np.allclose(features, features_1) - features_not_metal = featurizer.featurize(structure, -1) + features_not_metal = featurizer.featurize(MOF(structure), -1) assert np.abs(features - features_not_metal).sum() > 0 assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features.ndim == 1 @@ -30,12 +31,12 @@ def test_atom_centered_ph(hkust_structure, irmof_structure, hkust_la_structure): """Ensure we get the correct number of features and that they are different for different structures.""" for structure in [hkust_structure]: featurizer = AtomCenteredPH() - features = featurizer.featurize(structure) + features = featurizer.featurize(MOF(structure)) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - features_hkust = featurizer.featurize(hkust_structure) - features_irmof = featurizer.featurize(irmof_structure) + features_hkust = featurizer.featurize(MOF(hkust_structure)) + features_irmof = featurizer.featurize(MOF(irmof_structure)) assert (features_hkust != features_irmof).any() assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features.ndim == 1 @@ -44,14 +45,14 @@ def test_atom_centered_ph(hkust_structure, irmof_structure, hkust_la_structure): # we do this by computing the PH histograms for structures with same geometry # and connectivity, but different atoms featurizer = AtomCenteredPH(atom_types=None, alpha_weight="van_der_waals_radius") - features_hkust = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + features_hkust = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert features_hkust.shape == features_hkust_la.shape assert (features_hkust != features_hkust_la).any() # # now, to be sure do not encode the same thing with the atomic radius featurizer = AtomCenteredPH(atom_types=None, alpha_weight=None) - features_hkust = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + features_hkust = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert features_hkust.shape == features_hkust_la.shape assert features_hkust == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/featurizers/topology/test_ph_hist.py b/tests/featurizers/topology/test_ph_hist.py index 0154e466..afe3f286 100644 --- a/tests/featurizers/topology/test_ph_hist.py +++ b/tests/featurizers/topology/test_ph_hist.py @@ -3,6 +3,7 @@ import pytest from mofdscribe.featurizers.topology.ph_hist import PHHist +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -11,12 +12,13 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru """Ensure we get the correct number of features and that they are different for different structures.""" for structure in [hkust_structure, irmof_structure, cof_structure]: featurizer = PHHist() - features = featurizer.featurize(structure) + mof = MOF(structure) + features = featurizer.featurize(mof) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - hkust_feats = featurizer.featurize(hkust_structure) - irmof_feats = featurizer.featurize(irmof_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + irmof_feats = featurizer.featurize(MOF(irmof_structure)) assert (hkust_feats != irmof_feats).any() assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) @@ -26,14 +28,14 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru # we do this by computing the PH histograms for structures with same geometry # and connectivity, but different atoms featurizer = PHHist(atom_types=None, alpha_weight="atomic_radius_calculated") - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert (hkust_feats != features_hkust_la).any() # now, to be sure do not encode the same thing with the atomic radius featurizer = PHHist(atom_types=None, alpha_weight=None) - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/featurizers/topology/test_ph_image.py b/tests/featurizers/topology/test_ph_image.py index ee790d3c..7b4679cf 100644 --- a/tests/featurizers/topology/test_ph_image.py +++ b/tests/featurizers/topology/test_ph_image.py @@ -3,6 +3,7 @@ import pytest from mofdscribe.featurizers.topology.ph_image import PHImage +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -11,7 +12,7 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc """Ensure we get the correct number of features.""" phi = PHImage() for structure in [hkust_structure, irmof_structure, cof_structure]: - features = phi.featurize(structure) + features = phi.featurize(MOF(structure)) assert len(features) == 20 * 20 * 4 * 3 assert len(phi.feature_labels()) == 20 * 20 * 4 * 3 @@ -24,15 +25,15 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc phi = PHImage( atom_types=None, alpha_weight="atomic_radius_calculated", min_size=50, max_b=30, max_p=30 ) - image_cu = phi.featurize(hkust_structure) - image_la = phi.featurize(hkust_la_structure) + image_cu = phi.featurize(MOF(hkust_structure)) + image_la = phi.featurize(MOF(hkust_la_structure)) assert image_cu.shape == image_la.shape assert (image_cu != image_la).any() # now, to be sure do not encode the same thing with the atomic radius phi = PHImage(atom_types=None, alpha_weight=None, min_size=50, max_b=30, max_p=30) - image_cu = phi.featurize(hkust_structure) - image_la = phi.featurize(hkust_la_structure) + image_cu = phi.featurize(MOF(hkust_structure)) + image_la = phi.featurize(MOF(hkust_la_structure)) assert image_cu.shape == image_la.shape assert image_cu == pytest.approx(image_la, rel=1e-2) @@ -40,7 +41,7 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc def test_phimage_fit(hkust_structure, irmof_structure): """Ensure that calling fit changes the settings.""" phi = PHImage() - phi.fit([hkust_structure, irmof_structure]) + phi.fit([MOF(hkust_structure), MOF(irmof_structure)]) assert len(phi.max_b) == len(phi.max_p) == 4 assert phi.max_b[0] == phi.max_b[3] == 0 diff --git a/tests/featurizers/topology/test_ph_stats.py b/tests/featurizers/topology/test_ph_stats.py index 2a1271c0..5663b67b 100644 --- a/tests/featurizers/topology/test_ph_stats.py +++ b/tests/featurizers/topology/test_ph_stats.py @@ -3,20 +3,21 @@ import pytest from mofdscribe.featurizers.topology.ph_stats import PHStats +from mofdscribe.mof import MOF from ..helpers import is_jsonable def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_structure): """Ensure we get the correct number of features and that they are different for different structures.""" - for structure in [hkust_structure, irmof_structure, cof_structure]: + for structure in [MOF(hkust_structure), MOF(irmof_structure), MOF(cof_structure)]: featurizer = PHStats() features = featurizer.featurize(structure) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - hkust_feats = featurizer.featurize(hkust_structure) - irmof_feats = featurizer.featurize(irmof_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + irmof_feats = featurizer.featurize(MOF(irmof_structure)) assert (hkust_feats != irmof_feats).any() assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features.ndim == 1 @@ -25,14 +26,14 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru # we do this by computing the PH histograms for structures with same geometry # and connectivity, but different atoms featurizer = PHStats(atom_types=None, alpha_weight="atomic_radius_calculated") - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert (hkust_feats != features_hkust_la).any() # now, to be sure do not encode the same thing with the atomic radius featurizer = PHStats(atom_types=None, alpha_weight=None) - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/test_mof.py b/tests/test_mof.py new file mode 100644 index 00000000..fb12e1a9 --- /dev/null +++ b/tests/test_mof.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""test the MOF base class""" +from pymatgen.analysis.graphs import StructureGraph + +from mofdscribe.mof import MOF + + +def test_mof(hkust_structure): + mof = MOF(hkust_structure) + assert isinstance(mof.structure_graph, StructureGraph) + fragments = mof.fragments + assert hasattr(fragments, "linkers") + assert isinstance(mof.decorated_graph_hash, str) + assert isinstance(mof.decorated_scaffold_hash, str) + assert isinstance(mof.undecorated_graph_hash, str) + assert isinstance(mof.undecorated_scaffold_hash, str) + + +def test_mof_from_file(hkust_path): + mof = MOF.from_file(hkust_path) + sg_non_primitive = mof.structure_graph + assert isinstance(sg_non_primitive, StructureGraph) + fragments = mof.fragments + assert hasattr(fragments, "linkers") + assert isinstance(mof.decorated_graph_hash, str) + assert isinstance(mof.decorated_scaffold_hash, str) + assert isinstance(mof.undecorated_graph_hash, str) + assert isinstance(mof.undecorated_scaffold_hash, str) From 9ec153e180cd89be637eb4e795c7245fbf7c5f94 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Wed, 21 Dec 2022 19:51:10 +0100 Subject: [PATCH 02/17] continue refactor --- src/mofdscribe/featurizers/chemistry/amd.py | 29 +++++----- src/mofdscribe/featurizers/chemistry/aprdf.py | 16 +++--- .../featurizers/chemistry/energygrid.py | 13 +++-- src/mofdscribe/featurizers/chemistry/henry.py | 11 ++-- .../chemistry/partialchargehistogram.py | 12 ++--- .../chemistry/partialchargestats.py | 15 +++--- src/mofdscribe/featurizers/chemistry/price.py | 13 +++-- src/mofdscribe/featurizers/chemistry/racs.py | 13 +++-- .../featurizers/graph/dimensionality.py | 5 ++ .../featurizers/graph/graphfeaturizer.py | 53 ++----------------- .../featurizers/pore/geometric_properties.py | 12 ++--- src/mofdscribe/featurizers/pore/voxelgrid.py | 4 +- src/mofdscribe/mof.py | 4 +- src/mofdscribe/types.py | 2 +- tests/featurizers/chemistry/test_amd.py | 9 ++-- tests/featurizers/chemistry/test_aprdf.py | 3 +- .../featurizers/chemistry/test_energygrid.py | 7 +-- tests/featurizers/chemistry/test_henry.py | 9 ++-- .../chemistry/test_partialchargehistogram.py | 3 +- .../chemistry/test_partialchargestats.py | 3 +- tests/featurizers/chemistry/test_price.py | 5 +- tests/featurizers/chemistry/test_racs.py | 11 ++-- .../featurizers/graph/test_dimensionality.py | 6 ++- .../featurizers/graph/test_graphfeaturizer.py | 9 ---- .../hostguest/test_host_guest_featurizer.py | 1 + 25 files changed, 114 insertions(+), 154 deletions(-) delete mode 100644 tests/featurizers/graph/test_graphfeaturizer.py diff --git a/src/mofdscribe/featurizers/chemistry/amd.py b/src/mofdscribe/featurizers/chemistry/amd.py index 6daffeda..54a488be 100644 --- a/src/mofdscribe/featurizers/chemistry/amd.py +++ b/src/mofdscribe/featurizers/chemistry/amd.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- """Generalized average minimum distance (AMD) featurizer.""" -from typing import List, Tuple, Union +from typing import List, Tuple import numpy as np -from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.substructures import filter_element +from mofdscribe.types import StructureIStructureType __all__ = ["AMD"] @@ -43,7 +43,6 @@ def __init__( ), compute_for_all_elements: bool = True, aggregations: Tuple[str] = ("mean",), - primitive: bool = True, ) -> None: """Initialize the AMD descriptor. @@ -60,8 +59,6 @@ def __init__( the original structure with all elements. Defaults to True. aggregations (tuple): Aggregations of the AMD descriptor. The 'mean' is equivalent to the original AMD. Defaults to ('mean',). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.k = k atom_types = [] if atom_types is None else atom_types @@ -71,7 +68,6 @@ def __init__( ) self.compute_for_all_elements = compute_for_all_elements self.aggregations = aggregations - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -85,21 +81,30 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() - def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: """Compute the AMD descriptor for a given structure. Args: - structure (Union[Structure, IStructure]): Structure to compute the descriptor for. + structure (StructureIStructureType): Structure to compute the descriptor for. Returns: A numpy array containing the AMD descriptor. """ - from amd._nns import nearest_neighbours - from amd.calculate import _extract_motif_cell - from amd.periodicset import PeriodicSet + try: + from amd._nns import nearest_neighbours + from amd.calculate import _extract_motif_cell + from amd.periodicset import PeriodicSet + except ImportError: + raise ImportError( + "AMD featurizer requires the AMD package.\ + See https://github.com/dwiddo/average-minimum-distance for installation instructions." + ) def get_pdd(structure, k): - motif, cell, asymmetric_unit, multiplicities = _extract_motif_cell( + motif, cell, asymmetric_unit, _multiplicities = _extract_motif_cell( PeriodicSet(structure.cart_coords, structure.lattice.matrix) ) pdd, _, _ = nearest_neighbours(motif, cell, asymmetric_unit, k) diff --git a/src/mofdscribe/featurizers/chemistry/aprdf.py b/src/mofdscribe/featurizers/chemistry/aprdf.py index 7cbe7efc..95ee0967 100644 --- a/src/mofdscribe/featurizers/chemistry/aprdf.py +++ b/src/mofdscribe/featurizers/chemistry/aprdf.py @@ -9,12 +9,11 @@ import numpy as np from element_coder import encode -from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer - -from ..utils.aggregators import AGGREGATORS -from ..utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.featurizers.utils.aggregators import AGGREGATORS +from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.types import StructureIStructureType __all__ = ["APRDF"] @@ -50,7 +49,6 @@ def __init__( properties: Tuple[str, int] = ("X", "electron_affinity"), aggregations: Tuple[str] = ("avg", "product", "diff"), normalize: bool = False, - primitive: bool = True, ): """Set up an atomic property (AP) weighted radial distribution function. @@ -73,8 +71,6 @@ def __init__( options. Defaults to ("avg", "product", "diff"). normalize (bool): If True, the histogram is normalized by dividing by the number of atoms. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.lower_lim = lower_lim self.cutoff = cutoff @@ -84,7 +80,6 @@ def __init__( self.b_smear = b_smear self.aggregations = aggregations - super().__init__(primitive=primitive) def precheck(self): pass @@ -106,7 +101,10 @@ def _get_feature_labels(self): return list(aprdfs.flatten()) - def _featurize(self, s: Union[Structure, IStructure]) -> np.array: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.array: bins = self._bins aprdfs = np.zeros((len(self.properties), len(self.aggregations), len(bins))) diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py index b9fd4a81..fbb0e3f6 100644 --- a/src/mofdscribe/featurizers/chemistry/energygrid.py +++ b/src/mofdscribe/featurizers/chemistry/energygrid.py @@ -9,11 +9,11 @@ from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa - -from ..utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.types import StructureIStructureType __all__ = ["EnergyGridHistogram"] GRID_INPUT_TEMPLATE = """SimulationType MakeASCIGrid @@ -104,7 +104,6 @@ def __init__( shifted: bool = False, separate_interactions: bool = True, run_eqeq: bool = True, - primitive: bool = False, ): """Construct the EnergyGridHistogram class. @@ -147,8 +146,6 @@ def __init__( Defaults to True. run_eqeq (bool): If true, runs EqEq to compute charges. Defaults to True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. Raises: ValueError: If the `raspa_dir` is not a valid directory. @@ -175,7 +172,6 @@ def __init__( self.shifted = shifted self.separate_interactions = separate_interactions self.run_eqeq = run_eqeq - super().__init__(primitive=primitive) def fit_transform(self, structures: List[Union[Structure, IStructure]]): ... @@ -194,7 +190,10 @@ def feature_labels(self) -> List[str]: labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}") return labels - def _featurize(self, s: Union[Structure, IStructure]) -> np.array: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.array: ff_molecules = {self.mol_name: self.mol_ff} parameters = { diff --git a/src/mofdscribe/featurizers/chemistry/henry.py b/src/mofdscribe/featurizers/chemistry/henry.py index 638fa6f0..fa8a13f0 100644 --- a/src/mofdscribe/featurizers/chemistry/henry.py +++ b/src/mofdscribe/featurizers/chemistry/henry.py @@ -5,13 +5,13 @@ from typing import List, Union import numpy as np -from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.raspa.base_parser import parse_base_output from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa +from mofdscribe.types import StructureIStructureType __all__ = ["Henry"] @@ -83,7 +83,6 @@ def __init__( separate_interactions: bool = True, run_eqeq: bool = True, return_std: bool = False, - primitive: bool = False, ): """Initialize the featurizer. @@ -118,8 +117,6 @@ def __init__( Defaults to True. return_std (bool): If true, return the standard deviations. Defaults to False. - primitive (bool): If true, use the primitive unit cell. - Defaults to False. Raises: ValueError: If the `RASPA_DIR` environment variable is not set. @@ -144,9 +141,11 @@ def __init__( self.temperature = temperature self.run_eqeq = run_eqeq self.return_std = return_std - super().__init__(primitive=primitive) - def _featurize(self, s: Union[Structure, IStructure]) -> np.array: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.array: ff_molecules = {self.mol_name: self.mol_ff} parameters = { diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py index a46000d5..3ac436d9 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """Partial charge histogram featurizer.""" -from typing import List, Union +from typing import List import numpy as np from pymatgen.core import IStructure, Structure @@ -9,6 +9,7 @@ from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.types import StructureIStructureType __all__ = ["PartialChargeHistogram"] @@ -27,7 +28,6 @@ def __init__( min_charge: float = -4, max_charge: float = 4, bin_size: float = 0.5, - primitive: bool = False, ) -> None: """Construct a new PartialChargeHistogram featurizer. @@ -38,13 +38,10 @@ def __init__( Defaults to 4. bin_size (float): Bin size. Defaults to 0.5. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.min_charge = min_charge self.max_charge = max_charge self.bin_size = bin_size - super().__init__(primitive=primitive) def _get_grid(self): return np.arange(self.min_charge, self.max_charge, self.bin_size) @@ -52,7 +49,10 @@ def _get_grid(self): def feature_labels(self) -> List[str]: return [f"chargehist_{val}" for val in self._get_grid()] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(s=mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: if isinstance(s, Structure): s = IStructure.from_sites(s.sites) _, results = get_eqeq_charges(s) diff --git a/src/mofdscribe/featurizers/chemistry/partialchargestats.py b/src/mofdscribe/featurizers/chemistry/partialchargestats.py index d9e0d581..4be296e0 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargestats.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargestats.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """Partial charge statistics featurizer.""" -from typing import List, Tuple, Union +from typing import List, Tuple import numpy as np from pymatgen.core import IStructure, Structure @@ -9,6 +9,7 @@ from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.types import StructureIStructureType __all__ = ["PartialChargeStats"] @@ -26,9 +27,7 @@ class PartialChargeStats(MOFBaseFeaturizer): `_ """ - def __init__( - self, aggregations: Tuple[str] = ("max", "min", "std"), primitive: bool = True - ) -> None: + def __init__(self, aggregations: Tuple[str] = ("max", "min", "std")) -> None: """ Initialize the PartialChargeStats featurizer. @@ -36,16 +35,16 @@ def __init__( aggregations (Tuple[str]): Aggregations to compute. For available methods, see :py:obj:`mofdscribe.featurizers.utils.aggregators.ARRAY_AGGREGATORS`. Defaults to ("max", "min", "std"). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.aggregations = aggregations - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: return [f"chargestat_{agg}" for agg in self.aggregations] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(s=mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: if isinstance(s, Structure): s = IStructure.from_sites(s.sites) _, results = get_eqeq_charges(s) diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py index 0135217e..afde093d 100644 --- a/src/mofdscribe/featurizers/chemistry/price.py +++ b/src/mofdscribe/featurizers/chemistry/price.py @@ -9,6 +9,7 @@ import pandas as pd from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.types import StructureIStructureType _THIS_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -101,23 +102,18 @@ class PriceLowerBound(MOFBaseFeaturizer): "per_atom": _price_per_atom, } - def __init__( - self, projections: Tuple[str] = ("gravimetric", "volumetric"), primitive: bool = False - ): + def __init__(self, projections: Tuple[str] = ("gravimetric", "volumetric")): """Initialize the PriceLowerBound featurizer. Args: projections (Tuple[str]): List of projections to use. Possible values are "gravimetric" and "volumetric". Default is ("gravimetric", "volumetric"). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. """ self.projections = projections self.element_masses = _get_element_abundance() self.projections = projections - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: labels = [] @@ -125,7 +121,10 @@ def feature_labels(self) -> List[str]: labels.append(f"price_lower_bound_{projection}") return labels - def _featurize(self, structure) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: element_masses = {} for site in structure.sites: diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index fe6981f3..4087d034 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -10,6 +10,7 @@ from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.structure_graph import ( @@ -17,8 +18,7 @@ get_neighbors_at_distance, get_structure_graph, ) - -from ._fragment import get_bb_indices +from mofdscribe.types import StructureIStructureType __all__ = ("RACS",) @@ -166,7 +166,6 @@ def __init__( "linker_scaffold", "nodes", ), - primitive: bool = True, ) -> None: """ Initialize the RACS featurizer. @@ -184,8 +183,6 @@ def __init__( bond_heuristic (str): Method used to guess bonds. Defaults to "vesta". bbs (Tuple[str]): Building blocks to use. Defaults to ("linker_all", "linker_connecting", "linker_functional", "linker_scaffold", "nodes"). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.attributes = attributes self.scopes = scopes @@ -203,9 +200,11 @@ def __init__( "linker_scaffold", "nodes", ] - super().__init__(primitive=primitive) - def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: if isinstance(structure, Structure): structure = IStructure.from_sites(structure) sg = get_structure_graph(structure, self.bond_heuristic) diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py index 8f7c8633..7c0d9c77 100644 --- a/src/mofdscribe/featurizers/graph/dimensionality.py +++ b/src/mofdscribe/featurizers/graph/dimensionality.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +"""Compute the bond-topological dimensionality.""" from typing import List import numpy as np @@ -11,6 +13,9 @@ class Dimensionality(GraphFeaturizer): def __init__(self) -> None: super().__init__() + def featurize(self, mof: "MOF") -> np.ndarray: + return self._featurize(mof.structure_graph) + def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: return get_dimensionality_larsen(structure_graph) diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py index c083d69d..4f5dddc0 100644 --- a/src/mofdscribe/featurizers/graph/graphfeaturizer.py +++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py @@ -1,60 +1,15 @@ -from functools import lru_cache -from typing import Union - -import numpy as np -from matminer.featurizers.base import BaseFeaturizer -from pymatgen.analysis.graphs import StructureGraph -from pymatgen.core.structure import IStructure, Structure - +# -*- coding: utf-8 -*- from mofdscribe.featurizers.base import MOFBaseFeaturizer -from mofdscribe.featurizers.utils.extend import ( - operates_on_istructure, - operates_on_structure, - operates_on_structuregraph, -) -from mofdscribe.featurizers.utils.structure_graph import ( - get_structure_graph as get_structure_graph_cached, -) +from mofdscribe.featurizers.utils.extend import operates_on_structuregraph # operates on needs to be extended to graphs -def get_structure_graph( - structure: Union[Structure, IStructure], method: str = "vesta" -) -> StructureGraph: - if isinstance(structure, Structure): - return get_structure_graph_cached(IStructure.from_sites(structure.sites), method) - elif isinstance(structure, IStructure): - return get_structure_graph_cached(structure, method) - else: - raise TypeError(f"Expected Structure or IStructure, got {type(structure)}") - - # cannot use here the MOFBaseFeaturizer because the input is not always a structure # not super sure about this yet, but I think, over the long run, I'd like this to only accept graphs # i'd like to have some kind of transformer objects that handle the conversion from structure to graph # if needed # all of this could then be orchestrated in a pipeline object @operates_on_structuregraph -@operates_on_structure -@operates_on_istructure -class GraphFeaturizer(BaseFeaturizer): - def __init__(self) -> None: - super().__init__() - - def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: - raise NotImplementedError() - - def featurize( - self, structure_or_structuregraph: Union[Structure, StructureGraph] - ) -> np.ndarray: - if isinstance(structure_or_structuregraph, (Structure, IStructure)): - structure_graph = get_structure_graph(structure_or_structuregraph) - elif isinstance(structure_or_structuregraph, StructureGraph): - structure_graph = structure_or_structuregraph - else: - raise TypeError( - f"Expected Structure or StructureGraph, got {type(structure_or_structuregraph)}" - ) - - return self._featurize(structure_graph) +class GraphFeaturizer(MOFBaseFeaturizer): + pass diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py index 9bed420b..09ca6beb 100644 --- a/src/mofdscribe/featurizers/pore/geometric_properties.py +++ b/src/mofdscribe/featurizers/pore/geometric_properties.py @@ -21,7 +21,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils import is_tool from mofdscribe.featurizers.utils.tempdir import TEMPDIR -from mofdscribe.types import StuctureIStructureType +from mofdscribe.types import StructureIStructureType ZEOPP_BASE_COMMAND = ["network"] HA_COMMAND = ["-ha"] @@ -197,7 +197,7 @@ def __init__( def featurize(self, mof: "MOF") -> np.ndarray: return self._featurize(mof.structure) - def _featurize(self, s: StuctureIStructureType): + def _featurize(self, s: StructureIStructureType): result = run_zeopp(s, ["-res"], _parse_res_zeopp, self.ha) return np.array(list(result.values())) @@ -284,7 +284,7 @@ def __init__( def featurize(self, mof: "MOF") -> np.ndarray: return self._featurize(mof.structure) - def _featurize(self, s: StuctureIStructureType) -> np.ndarray: + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = [ "-sa", f"{self.channel_radius}", @@ -375,7 +375,7 @@ def __init__( def featurize(self, mof: "MOF") -> np.ndarray: return self._featurize(mof.structure) - def _featurize(self, s: StuctureIStructureType) -> np.ndarray: + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = ["-vol", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}"] results = run_zeopp(s, command, _parse_volpo_zeopp, self.ha) return np.array(list(results.values())) @@ -489,7 +489,7 @@ def feature_labels(self) -> List[str]: def featurize(self, mof: "MOF") -> np.ndarray: return self._featurize(mof.structure) - def _featurize(self, s: StuctureIStructureType) -> np.ndarray: + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = [ "-ray_atom", f"{self.channel_radius}", @@ -627,7 +627,7 @@ def feature_labels(self) -> List[str]: def featurize(self, mof: "MOF") -> np.ndarray: return self._featurize(mof.structure) - def _featurize(self, s: StuctureIStructureType) -> np.ndarray: + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = [ "-psd", f"{self.channel_radius}", diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py index 7605327a..68795c01 100644 --- a/src/mofdscribe/featurizers/pore/voxelgrid.py +++ b/src/mofdscribe/featurizers/pore/voxelgrid.py @@ -8,7 +8,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase -from mofdscribe.types import StuctureIStructureType +from mofdscribe.types import StructureIStructureType def make_supercell( @@ -154,7 +154,7 @@ def __init__( def featurize(self, mof: "MOF") -> np.ndarray: return self._featurize(mof.structure) - def _featurize(self, structure: StuctureIStructureType) -> np.ndarray: + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: coords, numbers = make_supercell( structure.cart_coords, structure.atomic_numbers, structure.lattice.matrix, self.min_size ) diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py index 4790fbf6..c79b31fa 100644 --- a/src/mofdscribe/mof.py +++ b/src/mofdscribe/mof.py @@ -14,7 +14,7 @@ undecorated_scaffold_hash, ) -from mofdscribe.types import PathType, StuctureIStructureType +from mofdscribe.types import PathType, StructureIStructureType class MOF: @@ -48,7 +48,7 @@ class MOF: def __init__( self, - structure: StuctureIStructureType, + structure: StructureIStructureType, structure_graph: Optional[StructureGraph] = None, local_env_strategy: str = "vesta", fragmentation_kwargs: Optional[dict] = None, diff --git a/src/mofdscribe/types.py b/src/mofdscribe/types.py index 30d98e98..ee6093c4 100644 --- a/src/mofdscribe/types.py +++ b/src/mofdscribe/types.py @@ -6,4 +6,4 @@ from typing_extensions import TypeAlias PathType: TypeAlias = Union[str, Path] -StuctureIStructureType: TypeAlias = Union[IStructure, Structure] +StructureIStructureType: TypeAlias = Union[IStructure, Structure] diff --git a/tests/featurizers/chemistry/test_amd.py b/tests/featurizers/chemistry/test_amd.py index c5f50a88..b7eb158c 100644 --- a/tests/featurizers/chemistry/test_amd.py +++ b/tests/featurizers/chemistry/test_amd.py @@ -6,21 +6,22 @@ import pytest from mofdscribe.featurizers.chemistry.amd import AMD +from mofdscribe.mof import MOF -THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +_THIS_DIR = os.path.dirname(os.path.abspath(__file__)) def test_amd_consistency(hkust_structure): """Test that AMD is consistent with the original package.""" amd_mofdscribe = AMD(k=100, atom_types=None) - amd_hkust_mofdscribe = amd_mofdscribe.featurize(hkust_structure) + amd_hkust_mofdscribe = amd_mofdscribe.featurize(MOF(hkust_structure)) - reader = amd.CifReader(os.path.join(THIS_DIR, "..", "..", "test_files", "HKUST-1.cif")) + reader = amd.CifReader(os.path.join(_THIS_DIR, "..", "..", "test_files", "HKUST-1.cif")) amds = [amd.AMD(crystal, 100) for crystal in reader] for feat, amd_feat in zip(amds[0], amd_hkust_mofdscribe): assert feat == pytest.approx(amd_feat, abs=1e-3) amd_mofdscribe = AMD(k=100, aggregations=("mean", "std")) - amd_hkust_mofdscribe = amd_mofdscribe.featurize(hkust_structure) + amd_hkust_mofdscribe = amd_mofdscribe.featurize(MOF(hkust_structure)) assert len(amd_hkust_mofdscribe) == 100 * 2 * 4 assert len(amd_mofdscribe.feature_labels()) == 100 * 2 * 4 diff --git a/tests/featurizers/chemistry/test_aprdf.py b/tests/featurizers/chemistry/test_aprdf.py index 576350ef..78baf67c 100644 --- a/tests/featurizers/chemistry/test_aprdf.py +++ b/tests/featurizers/chemistry/test_aprdf.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test APRDF featurizer.""" from mofdscribe.featurizers.chemistry.aprdf import APRDF +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -8,7 +9,7 @@ def test_aprdf(hkust_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" aprdf_featurizer = APRDF() - feats = aprdf_featurizer.featurize(hkust_structure) + feats = aprdf_featurizer.featurize(MOF(hkust_structure)) assert len(feats) == 432 assert len(aprdf_featurizer.feature_labels()) == 432 assert is_jsonable(dict(zip(aprdf_featurizer.feature_labels(), feats))) diff --git a/tests/featurizers/chemistry/test_energygrid.py b/tests/featurizers/chemistry/test_energygrid.py index 8efc8ae3..99e034e7 100644 --- a/tests/featurizers/chemistry/test_energygrid.py +++ b/tests/featurizers/chemistry/test_energygrid.py @@ -3,15 +3,16 @@ import os from mofdscribe.featurizers.chemistry.energygrid import EnergyGridHistogram, read_ascii_grid +from mofdscribe.mof import MOF from ..helpers import is_jsonable -THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +_THIS_DIR = os.path.dirname(os.path.abspath(__file__)) def test_read_ascii_grid(): """Ensure that we can parse an ASCII grid file.""" - file_name = os.path.join(THIS_DIR, "..", "..", "test_files", "asci_grid_C_co2.grid") + file_name = os.path.join(_THIS_DIR, "..", "..", "test_files", "asci_grid_C_co2.grid") result = read_ascii_grid(file_name) assert len(result) == 22185 assert result["energy"].dtype == float @@ -20,7 +21,7 @@ def test_read_ascii_grid(): def test_energygrid(hkust_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" eg = EnergyGridHistogram() - feats = eg.featurize(hkust_structure) + feats = eg.featurize(MOF(hkust_structure)) assert len(feats) == 40 assert len(eg.feature_labels()) == 40 diff --git a/tests/featurizers/chemistry/test_henry.py b/tests/featurizers/chemistry/test_henry.py index 683164b3..24ff6df0 100644 --- a/tests/featurizers/chemistry/test_henry.py +++ b/tests/featurizers/chemistry/test_henry.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test the Henry featurizer.""" from mofdscribe.featurizers.chemistry.henry import Henry +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -8,19 +9,19 @@ def test_henryhoa(hkust_structure, irmof_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" featurizer = Henry(cycles=10) - features = featurizer.featurize(hkust_structure) + features = featurizer.featurize(MOF(hkust_structure)) labels = featurizer.feature_labels() assert len(features) == len(labels) - features_irmof = featurizer.featurize(irmof_structure) + features_irmof = featurizer.featurize(MOF(irmof_structure)) assert sum(abs(features - features_irmof)) > 0 assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features_irmof.ndim == 1 featurizer = Henry(cycles=500, return_std=True) - features = featurizer.featurize(hkust_structure) + features = featurizer.featurize(MOF(hkust_structure)) assert len(features) == len(featurizer.feature_labels()) == 4 # make sure we indeed use pyeqeq featurizer = Henry(cycles=500, return_std=True, run_eqeq=False) - features_no_charge = featurizer.featurize(hkust_structure) + features_no_charge = featurizer.featurize(MOF(hkust_structure)) assert features_no_charge[0] < features[0] diff --git a/tests/featurizers/chemistry/test_partialchargehistogram.py b/tests/featurizers/chemistry/test_partialchargehistogram.py index 2260f96a..c53d28d4 100644 --- a/tests/featurizers/chemistry/test_partialchargehistogram.py +++ b/tests/featurizers/chemistry/test_partialchargehistogram.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test partial charge featurizer.""" from mofdscribe.featurizers.chemistry.partialchargehistogram import PartialChargeHistogram +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -13,7 +14,7 @@ def test_partialchargehistogram(hkust_structure, irmof_structure): assert len(pch.feature_labels()) == 16 for structure in [hkust_structure, irmof_structure]: - features = pch.featurize(structure) + features = pch.featurize(MOF(structure)) assert len(features) == 16 assert is_jsonable(dict(zip(pch.feature_labels(), features))) assert features.ndim == 1 diff --git a/tests/featurizers/chemistry/test_partialchargestats.py b/tests/featurizers/chemistry/test_partialchargestats.py index 8510c2ad..b663eee6 100644 --- a/tests/featurizers/chemistry/test_partialchargestats.py +++ b/tests/featurizers/chemistry/test_partialchargestats.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test the PartialChargeStats featurizer.""" from mofdscribe.featurizers.chemistry.partialchargestats import PartialChargeStats +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -11,7 +12,7 @@ def test_partial_charge_stats(hkust_structure, irmof_structure): """ for structure in [hkust_structure, irmof_structure]: featurizer = PartialChargeStats() - feats = featurizer.featurize(structure) + feats = featurizer.featurize(MOF(structure)) assert len(feats) == 3 assert len(featurizer.feature_labels()) == 3 assert len(featurizer.citations()) == 2 diff --git a/tests/featurizers/chemistry/test_price.py b/tests/featurizers/chemistry/test_price.py index f228329d..cc6863a9 100644 --- a/tests/featurizers/chemistry/test_price.py +++ b/tests/featurizers/chemistry/test_price.py @@ -2,17 +2,18 @@ import pytest from mofdscribe.featurizers.chemistry.price import PriceLowerBound +from mofdscribe.mof import MOF def test_price_lower_bound(hkust_structure): """Comparing with the original implementation.""" pricer = PriceLowerBound() - feats = pricer.featurize(hkust_structure) + feats = pricer.featurize(MOF(hkust_structure)) assert len(feats) == 2 assert feats[0] == pytest.approx(4.176635436396251) assert feats[1] == pytest.approx(3.671662426852288) pricer = PriceLowerBound(("per_atom",)) - feats = pricer.featurize(hkust_structure) + feats = pricer.featurize(MOF(hkust_structure)) assert len(feats) == 1 assert feats[0] == pytest.approx(64.77758513112447) diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 7ce244c0..44457fa7 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -8,6 +8,7 @@ from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs from mofdscribe.featurizers.utils.structure_graph import get_structure_graph +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -16,7 +17,7 @@ def test_racs(hkust_structure, irmof_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" for structure in [hkust_structure, irmof_structure]: featurizer = RACS() - feats = featurizer.featurize(structure) + feats = featurizer.featurize(MOF(structure)) sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) racs = {} bb_indices = get_bb_indices(sg) @@ -50,8 +51,8 @@ def test_racs(hkust_structure, irmof_structure): def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): # ABACUF doesn't have linkers with a core. It is simply HCOO for structure in [abacuf_structure]: - featurizer = RACS(primitive=False) # because also the linker structure is not primitive - feats = featurizer.featurize(structure) + featurizer = RACS() + feats = featurizer.featurize(MOF(structure)) # assert len(feats) == 4 * 3 * 8 * 5 # 4 properties, 3 scopes, 8 aggregations, 5 bb types sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) racs = {} @@ -78,7 +79,7 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): assert is_jsonable(dict(zip(featurizer.feature_labels(), feats))) assert feats.ndim == 1 - floating_feats = featurizer.featurize(floating_structure) - irmof_feats = featurizer.featurize(irmof_structure) + floating_feats = featurizer.featurize(MOF(floating_structure)) + irmof_feats = featurizer.featurize(MOF(irmof_structure)) assert np.allclose(floating_feats, irmof_feats, rtol=0.05, equal_nan=True) diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py index 25b96e05..52e3e85e 100644 --- a/tests/featurizers/graph/test_dimensionality.py +++ b/tests/featurizers/graph/test_dimensionality.py @@ -1,9 +1,11 @@ +# -*- coding: utf-8 -*- from mofdscribe.featurizers.graph.dimensionality import Dimensionality from mofdscribe.featurizers.utils.structure_graph import get_structure_graph +from mofdscribe.mof import MOF def test_dimensionality(hkust_structure): dim = Dimensionality() sg = get_structure_graph(hkust_structure, "vesta") - assert dim.featurize(hkust_structure) == 3 - assert dim.featurize(sg) == 3 + assert dim.featurize(MOF(hkust_structure)) == 3 + assert dim._featurize(sg) == 3 diff --git a/tests/featurizers/graph/test_graphfeaturizer.py b/tests/featurizers/graph/test_graphfeaturizer.py deleted file mode 100644 index d130450a..00000000 --- a/tests/featurizers/graph/test_graphfeaturizer.py +++ /dev/null @@ -1,9 +0,0 @@ -from pymatgen.analysis.graphs import StructureGraph - -from mofdscribe.featurizers.graph.graphfeaturizer import get_structure_graph - - -def test_get_structure_graph(hkust_structure): - sg = get_structure_graph(hkust_structure) - assert isinstance(sg, StructureGraph) - assert len(sg) == len(hkust_structure) diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py index 9b194c3e..dbbd1bbf 100644 --- a/tests/featurizers/hostguest/test_host_guest_featurizer.py +++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from matminer.featurizers.structure.sites import SiteStatsFingerprint from mofdscribe.featurizers.hostguest import HostGuestFeaturizer From 73daf4d3455e0af586af564aebeeace06cba9320 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Wed, 21 Dec 2022 19:59:35 +0100 Subject: [PATCH 03/17] continue refactor --- src/mofdscribe/featurizers/base.py | 1 + .../featurizers/bu/bu_featurizer.py | 22 +++++-------- src/mofdscribe/featurizers/bu/bu_matches.py | 2 +- src/mofdscribe/featurizers/bu/fragment.py | 22 ------------- .../featurizers/bu/shape_featurizer.py | 2 +- .../featurizers/bu/smarts_matches.py | 1 + src/mofdscribe/featurizers/matmineradapter.py | 33 +++++++++++++++++++ tests/featurizers/chemistry/test_racs.py | 2 +- 8 files changed, 46 insertions(+), 39 deletions(-) delete mode 100644 src/mofdscribe/featurizers/bu/fragment.py create mode 100644 src/mofdscribe/featurizers/matmineradapter.py diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py index 75484d56..e5105a91 100644 --- a/src/mofdscribe/featurizers/base.py +++ b/src/mofdscribe/featurizers/base.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """Base featurizer for MOF structure based featurizers. The main purpose of these classes is currently that they handle diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index d1a3c50b..baff7e82 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -3,11 +3,10 @@ from typing import Collection, List, Optional, Tuple, Union import numpy as np -from matminer.featurizers.base import BaseFeaturizer -from pydantic import BaseModel from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure +from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.bu.fragment import fragment from mofdscribe.featurizers.bu.utils import boxed_molecule from mofdscribe.featurizers.utils import nan_array, set_operates_on @@ -16,25 +15,18 @@ STRUCTURE_VIEWS = ("all_atom", "connecting", "binding") -class MOFBBs(BaseModel): - """Container for MOF building blocks.""" - - nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]] - linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule, StructureGraph]]] - - -class ConnectingSitesFeaturizer(BaseFeaturizer): +class ConnectingSitesFeaturizer(MOFBaseFeaturizer): ... -class BindingSitesFeaturizer(BaseFeaturizer): +class BindingSitesFeaturizer(MOFBaseFeaturizer): ... # If we would cache the fragmentation, the implementation would be simpler # ToDo: support moffragmentor options # ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method). -class BUFeaturizer(BaseFeaturizer): +class BUFeaturizer(MOFBaseFeaturizer): """ Compute features on the BUs and then aggregate them. @@ -65,13 +57,15 @@ class BUFeaturizer(BaseFeaturizer): """ def __init__( - self, featurizer: BaseFeaturizer, aggregations: Tuple[str] = ("mean", "std", "min", "max") + self, + featurizer: MOFBaseFeaturizer, + aggregations: Tuple[str] = ("mean", "std", "min", "max"), ) -> None: """ Construct a new BUFeaturizer. Args: - featurizer (BaseFeaturizer): The featurizer to use. + featurizer (MOFBaseFeaturizer): The featurizer to use. Currently, we do not support `MultipleFeaturizer`s. Please, instead, use multiple BUFeaturizers. If you use a featurizer that is not implemented in mofdscribe diff --git a/src/mofdscribe/featurizers/bu/bu_matches.py b/src/mofdscribe/featurizers/bu/bu_matches.py index 4bd3c99c..9140bdc1 100644 --- a/src/mofdscribe/featurizers/bu/bu_matches.py +++ b/src/mofdscribe/featurizers/bu/bu_matches.py @@ -10,7 +10,7 @@ from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IMolecule, IStructure, Molecule, Structure -from ..utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS THIS_DIR = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(THIS_DIR, "prototype_env.json"), "r") as handle: diff --git a/src/mofdscribe/featurizers/bu/fragment.py b/src/mofdscribe/featurizers/bu/fragment.py deleted file mode 100644 index 453b2359..00000000 --- a/src/mofdscribe/featurizers/bu/fragment.py +++ /dev/null @@ -1,22 +0,0 @@ -from functools import lru_cache -from typing import List, Union - -from pymatgen.core import IStructure, Structure - - -@lru_cache(maxsize=None) -def _fragment_cached(structure): - from moffragmentor import MOF - - mof = MOF.from_structure(structure) - fragments = mof.fragment() - return fragments - - -def fragment(structure: Union[Structure, IStructure]) -> List[Structure]: - if isinstance(structure, Structure): - return _fragment_cached(IStructure.from_sites(structure.sites)) - elif isinstance(structure, IStructure): - return _fragment_cached(structure) - else: - raise TypeError(f"Expected Structure or IStructure, got {type(structure)}") diff --git a/src/mofdscribe/featurizers/bu/shape_featurizer.py b/src/mofdscribe/featurizers/bu/shape_featurizer.py index c1b1002f..a71980ed 100644 --- a/src/mofdscribe/featurizers/bu/shape_featurizer.py +++ b/src/mofdscribe/featurizers/bu/shape_featurizer.py @@ -13,7 +13,7 @@ from rdkit.Chem.Descriptors3D import RadiusOfGyration as RadiusOfGyration_RDKIT from rdkit.Chem.Descriptors3D import SpherocityIndex as SpherocityIndex_RDKIT -from .rdkitadaptor import RDKitAdaptor +from mofdscribe.featurizes.bu.rdkitadaptor import RDKitAdaptor def _rod_likeness(mol): diff --git a/src/mofdscribe/featurizers/bu/smarts_matches.py b/src/mofdscribe/featurizers/bu/smarts_matches.py index ae5b141d..86465d13 100644 --- a/src/mofdscribe/featurizers/bu/smarts_matches.py +++ b/src/mofdscribe/featurizers/bu/smarts_matches.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """Featurize a molecule using SMARTS matches.""" from functools import partial from typing import Collection, List, Optional diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py new file mode 100644 index 00000000..7f7deb3c --- /dev/null +++ b/src/mofdscribe/featurizers/matmineradapter.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""Use matminer featurizers in mofdscribe.""" +from typing import List + +from pymatgen.core.structure import Structure + +from mofdscribe.featurizers.base import MOFBaseFeaturizer + + +class MatminerAdapter(MOFBaseFeaturizer): + def __init__(self, matminer_featurizer): + self.matminer_featurizer = matminer_featurizer + + def _featurize(self, structure: Structure): + return self.matminer_featurizer.featurize(structure) + + def featurize(self, mof): + return self._featurize(mof.structure) + + def _fit(self, structures: List[Structure]): + self.matminer_featurizer.fit(structures) + + def fit(self, mofs): + self._fit([mof.structure for mof in mofs]) + + def feature_labels(self): + return self.matminer_featurizer.feature_labels() + + def citations(self): + return self.matminer_featurizer.citations() + + def implementors(self): + return self.matminer_featurizer.implementors() diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 44457fa7..6262be54 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -51,7 +51,7 @@ def test_racs(hkust_structure, irmof_structure): def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): # ABACUF doesn't have linkers with a core. It is simply HCOO for structure in [abacuf_structure]: - featurizer = RACS() + featurizer = RACS() feats = featurizer.featurize(MOF(structure)) # assert len(feats) == 4 * 3 * 8 * 5 # 4 properties, 3 scopes, 8 aggregations, 5 bb types sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) From 81cf3353ca5edf2f2426389269a997223dce7e5f Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Thu, 22 Dec 2022 13:25:19 +0100 Subject: [PATCH 04/17] finished most changes --- docs/source/background.rst | 16 ++-- docs/source/getting_started.rst | 34 +++---- examples/build_model_using_mofdscribe.ipynb | 5 +- examples/featurize.ipynb | 7 +- .../featurizers/bu/bu_featurizer.py | 58 ++++++------ src/mofdscribe/featurizers/bu/rdkitadaptor.py | 4 +- .../featurizers/bu/shape_featurizer.py | 2 +- src/mofdscribe/featurizers/matmineradapter.py | 62 ++++++++++++- tests/featurizers/bu/test_bu_featurizer.py | 27 +++--- tests/test_files/BTC.cif | 93 ++++++++----------- tests/test_matmineradapter.py | 35 +++++++ 11 files changed, 211 insertions(+), 132 deletions(-) create mode 100644 tests/test_matmineradapter.py diff --git a/docs/source/background.rst b/docs/source/background.rst index 6fa7e999..d47926d7 100644 --- a/docs/source/background.rst +++ b/docs/source/background.rst @@ -75,11 +75,13 @@ mofdscribe can compute descriptors that are BU-centred, for instance, using RDKi from matminer.featurizers.structure import SiteStatsFingerprint from pymatgen.core import Structure from mofdscribe.featurizers.bu import BUFeaturizer + from mofdscribe.mof import MOF + from mofdscribe.featurizers.matmineradapter import MatminerAdapter - base_feat = SiteStatsFingerprint(SOAP.from_preset("formation_energy")) - base_feat.fit([hkust_structure]) + base_feat = MatminerAdapter(SiteStatsFingerprint(SOAP.from_preset("formation_energy"))) + base_feat.fit([MOF(hkust_structure)]) featurizer = BUFeaturizer(base_feat, aggregations=("mean",)) - features = featurizer.featurize(structure=hkust_structure) + features = featurizer.featurize(structure=MOF(hkust_structure)) For this, you can either provide your building blocks that you extracted with any of the available tools, or use our integration with our `moffragmentor `_ package. In this case, we will fragment the MOF into its building blocks and then compute the features for each building block and let you choose how you want to aggregate them. @@ -100,15 +102,15 @@ This featurizer will automatically extract the host and guest structures from th .. code-block:: python from matminer.featurizers.structure.sites import SiteStatsFingerprint - + from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.featurizers.hostguest import HostGuestFeaturizer featurizer = HostGuestFeaturizer( - featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"), + featurizer=MatminerAdapter(SiteStatsFingerprint.from_preset("SOAP_formation_energy")), aggregations=("mean",), ) - featurizer.fit([structure]) - features = featurizer.featurize(structure) + featurizer.fit([mof]) + features = featurizer.featurize(mof) If you are interested in surface chemistry features, you might also find suitable featurizers in the `matminer `_ package. diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 4c9c5d9c..755c594d 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -8,23 +8,12 @@ Featurizing a MOF .. code-block:: python from mofdscribe.chemistry.racs import RACS - from pymatgen.core import Structure + from mofdscribe.mof import MOF - s = Structure.from_file() + mof = MOF.from_file() featurizer = RACS() - features = featurizer.featurize(s) + features = featurizer.featurize(mof) -.. admonition:: mofdscribe base classes - :class: hint - - Most featurizers in mofdscribe inherit from :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. - This class can also handle the conversion to primitive cells if you pass :code:`primitive=True` to the - constructor. This can be useful to save computational time but also make it possible to, e.g., - use the :code:`sum` aggregation. - - To avoid re-computation of the primitive cell, you should use the :py:class:`~mofdscribe.featurizers.base.MOFMultipleFeaturizer` - for combining multiple featurizers. This will accept a keyword argument :code:`primitive=True` in the constructor - and then compute the primitive cell once and use it for all the featurizers. It is also easy to combine multiple featurizers into a single pipeline: @@ -32,21 +21,21 @@ It is also easy to combine multiple featurizers into a single pipeline: from mofdscribe.chemistry.racs import RACS from mofdscribe.pore.geometric_properties import PoreDiameters - from pymatgen.core import Structure + from mofdscribe.mof import MOF from mofdscribe.featurizers.base import MOFMultipleFeaturizer - s = Structure.from_file() + mof = MOF.from_file() featurizer = MOFMultipleFeaturizer([RACS(), PoreDiameters()]) - features = featurizer.featurize(s) + features = featurizer.featurize(mof) You can, of course, also pass multiple structures to the featurizer (and the featurization is automatically parallelized via matminer): .. code-block:: python - s = Structure.from_file() - s2 = Structure.from_file() - features = featurizer.featurize_many([s, s2]) + mof_1 = MOF.from_file() + mof_2 = MOF.from_file() + features = featurizer.featurize_many([mof_1, mof_2]) And, clearly, you can also use the `mofdscribe` featurizers alongside ones from `matminer`: @@ -55,9 +44,10 @@ And, clearly, you can also use the `mofdscribe` featurizers alongside ones from from matminer.featurizers.structure import LocalStructuralOrderParams from mofdscribe.chemistry.racs import RACS + from mofdscribe.featurizers.matmineradapter import MatminerAdapter - featurizer = MOFMultipleFeaturizer([RACS(), LocalStructuralOrderParams()]) - features = featurizer.featurize_many([s, s2]) + featurizer = MOFMultipleFeaturizer([RACS(), MatminerAdapter(LocalStructuralOrderParams())]) + features = featurizer.featurize_many([mof_2, mof_2]) If you use the :code:`zeo++` or :code:`raspa2` packages, you can customize the temporary diff --git a/examples/build_model_using_mofdscribe.ipynb b/examples/build_model_using_mofdscribe.ipynb index 686248ef..94093bd8 100644 --- a/examples/build_model_using_mofdscribe.ipynb +++ b/examples/build_model_using_mofdscribe.ipynb @@ -54,7 +54,8 @@ "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n", "from mofdscribe.datasets.thermal_stability_dataset import ThermalStabilityDataset\n", "from mofdscribe.datasets.structuredataset import FrameDataset\n", - "from mofdscribe.splitters import HashSplitter" + "from mofdscribe.splitters import HashSplitter\n", + "from mofdscribe.mof import MOF" ] }, { @@ -19021,7 +19022,7 @@ ], "source": [ "feats = featurizer.featurize_many(\n", - " all_structures, ignore_errors=True\n", + " [MOF(s) for s in all_structures], ignore_errors=True\n", ") # we ignore errors here because some structures might not be fragmentable, those will then have NaNs in the feature matrix" ] }, diff --git a/examples/featurize.ipynb b/examples/featurize.ipynb index 65722823..2f3e3590 100644 --- a/examples/featurize.ipynb +++ b/examples/featurize.ipynb @@ -17,7 +17,8 @@ "\n", "from mofdscribe.datasets import CoREDataset\n", "from mofdscribe.featurizers.chemistry import APRDF, RACS\n", - "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n" + "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n", + "from mofdscribe.mof import MOF\n" ] }, { @@ -108,7 +109,7 @@ ], "source": [ "racs_featurizer = RACS()\n", - "racs = racs_featurizer.featurize(next(structures))\n" + "racs = racs_featurizer.featurize(MOF(next(structures)))\n" ] }, { @@ -1977,7 +1978,7 @@ } ], "source": [ - "feats_all = featurizer.featurize_many(list(ds.get_structures(range(3))), ignore_errors=True)\n" + "feats_all = featurizer.featurize_many([MOF(s) for s in list(ds.get_structures(range(3)))], ignore_errors=True)\n" ] }, { diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index baff7e82..85ed7496 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -3,17 +3,24 @@ from typing import Collection, List, Optional, Tuple, Union import numpy as np -from pymatgen.analysis.graphs import StructureGraph +from pydantic import BaseModel from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer -from mofdscribe.featurizers.bu.fragment import fragment from mofdscribe.featurizers.bu.utils import boxed_molecule from mofdscribe.featurizers.utils import nan_array, set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS STRUCTURE_VIEWS = ("all_atom", "connecting", "binding") +# since fragmentation is not super straightforward, +# we give the option to provide the fragments directly +class MOFBBs(BaseModel): + """Container for MOF building blocks.""" + + nodes: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] + linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] + class ConnectingSitesFeaturizer(MOFBaseFeaturizer): ... @@ -53,7 +60,6 @@ class BUFeaturizer(MOFBaseFeaturizer): mofbbs=MOFBBs(nodes=[BabelMolAdaptor.from_string( "[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]", "smi").pymatgen_mol], linkers=[BabelMolAdaptor.from_string("CCCC", "smi").pymatgen_mol])) - """ def __init__( @@ -90,25 +96,24 @@ def feature_labels(self) -> List[str]: def _extract_bbs( self, - structure: Optional[Union[Structure, IStructure]] = None, + mof: Optional["MOF"], mofbbs: Optional[MOFBBs] = None, ): - if structure is None and mofbbs is None: + if mof is None and mofbbs is None: raise ValueError("You must provide a structure or mofbbs.") - if structure is not None: - fragments = fragment(structure) + if mof is not None: + fragments = mof.fragments if self._operates_on in ("both", "molecule"): linkers = [linker.molecule for linker in fragments.linkers] nodes = [node.molecule for node in fragments.nodes] else: # create a boxed structure - linkers = [boxed_molecule(linker.molecule) for linker in fragments.linkers] - nodes = [boxed_molecule(node.molecule) for node in fragments.nodes] + linkers = [linker._get_boxed_structure() for linker in fragments.linkers] + nodes = [node._get_boxed_structure() for node in fragments.nodes] if mofbbs is not None: - linkers = list(mofbbs.linkers) if mofbbs.linkers is not None else [] nodes = list(mofbbs.nodes) if mofbbs.nodes is not None else [] types = [type(node) for node in nodes] + [type(linker) for linker in linkers] @@ -138,22 +143,25 @@ def _extract_bbs( return nodes, linkers + def _fit(self): + raise NotImplementedError("BUFeaturizer does not support _fit.") + def fit( self, - structures: Optional[Collection[Structure]] = None, + mofs: Optional[Collection["MOF"]] = None, mofbbs: Optional[Collection[MOFBBs]] = None, ) -> None: """ Fit the featurizer to the given structures. Args: - structures (Collection[Structure], optional): The structures to featurize. + structures (Collection[MOF], optional): The MOFs to featurize. mofbbs (Collection[MOFBBs], optional): The MOF fragments (nodes and linkers). """ all_nodes, all_linkers = [], [] - if structures is not None: - for structure in structures: - nodes, linkers = self._extract_bbs(structure=structure) + if mofs is not None: + for mof in mofs: + nodes, linkers = self._extract_bbs(mof=mof) all_nodes.extend(nodes) all_linkers.extend(linkers) if mofbbs is not None: @@ -163,16 +171,14 @@ def fit( all_linkers.extend(linkers) all_fragments = all_nodes + all_linkers - self._featurizer.fit(all_fragments) + self._featurizer._fit(all_fragments) + + def _featurize(self): + raise NotImplementedError("BUFeaturizer does not support _featurize.") - # ToDo: - # - Perhaps use type dispatch instead of different keyword arguments. - # (we can use fastcore or code it ourselves) - # - We can also directly pass the graph to the featurizer if it want to work - # on the graph. def featurize( self, - structure: Optional[Union[Structure, IStructure]] = None, + mof: Optional["MOF"] = None, mofbbs: Optional[MOFBBs] = None, ) -> np.ndarray: """ @@ -186,7 +192,7 @@ def featurize( where possible. Args: - structure (Union[Structure, IStructure], optional): The structure to featurize. + mof (MOF, optional): The structure to featurize. mofbbs (MOFBBs, optional): The MOF fragments (nodes and linkers). Returns: @@ -194,12 +200,12 @@ def featurize( """ # if i know what the featurizer wants, I can always cast to a structure num_features = len(self._featurizer.feature_labels()) - nodes, linkers = self._extract_bbs(structure, mofbbs) - linker_feats = [self._featurizer.featurize(linker) for linker in linkers] + nodes, linkers = self._extract_bbs(mof, mofbbs) + linker_feats = [self._featurizer._featurize(linker) for linker in linkers] if not linker_feats: linker_feats = [nan_array(num_features)] - node_feats = [self._featurizer.featurize(node) for node in nodes] + node_feats = [self._featurizer._featurize(node) for node in nodes] if not node_feats: node_feats = [nan_array(num_features)] diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py index 8370871a..dd5db2f7 100644 --- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py +++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py @@ -10,8 +10,8 @@ from pymatgen.core import Molecule from structuregraph_helpers.create import get_local_env_method -from .utils import create_rdkit_mol_from_mol_graph -from ..utils.extend import operates_on_imolecule, operates_on_molecule +from mofdscribe.featurizers.bu.utils import create_rdkit_mol_from_mol_graph +from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule @operates_on_molecule diff --git a/src/mofdscribe/featurizers/bu/shape_featurizer.py b/src/mofdscribe/featurizers/bu/shape_featurizer.py index a71980ed..fd920efe 100644 --- a/src/mofdscribe/featurizers/bu/shape_featurizer.py +++ b/src/mofdscribe/featurizers/bu/shape_featurizer.py @@ -13,7 +13,7 @@ from rdkit.Chem.Descriptors3D import RadiusOfGyration as RadiusOfGyration_RDKIT from rdkit.Chem.Descriptors3D import SpherocityIndex as SpherocityIndex_RDKIT -from mofdscribe.featurizes.bu.rdkitadaptor import RDKitAdaptor +from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor def _rod_likeness(mol): diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py index 7f7deb3c..8e8bb585 100644 --- a/src/mofdscribe/featurizers/matmineradapter.py +++ b/src/mofdscribe/featurizers/matmineradapter.py @@ -2,32 +2,84 @@ """Use matminer featurizers in mofdscribe.""" from typing import List +import numpy as np +from matminer.featurizers.base import BaseFeaturizer from pymatgen.core.structure import Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer class MatminerAdapter(MOFBaseFeaturizer): - def __init__(self, matminer_featurizer): + """ + MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe. + + That is, you can then pass :py:class:`mofdscribe.mof.MOF` objects to the + featurizer. + + .. note:: + + If your matminer featurizer needs an expensive + computation, e.g., the structure graph, if cannot reuse the + one computed on the ``MOF`` object. + + Example: + + >>> from mofdscribe.featurizers.matmineradapter import MatminerAdapter + >>> from matminer.featurizers.structure import DensityFeatures + >>> from mofdscribe.mof import MOF + >>> from pymatgen.core.structure import Structure + >>> hkust_structure = Structure.from_file("tests/data/hkust-1.cif") + >>> matminer_featurizer = DensityFeatures() + >>> adapter = MatminerAdapter(matminer_featurizer) + >>> features = adapter.featurize(MOF(hkust_structure)) + >>> original_features = matminer_featurizer.featurize(hkust_structure) + >>> np.allclose(features, original_features) + True + """ + + def __init__(self, matminer_featurizer: BaseFeaturizer): + """Construct a MatminerAdapter. + + Args: + matminer_featurizer (BaseFeaturizer): A matminer featurizer. + """ self.matminer_featurizer = matminer_featurizer def _featurize(self, structure: Structure): + """Call the featurize method of the matminer featurizer.""" return self.matminer_featurizer.featurize(structure) - def featurize(self, mof): + def featurize(self, mof: "MOF") -> np.ndarray: + """Call the featurize method of the matminer featurizer using a MOF as input. + + Args: + mof (MOF): A MOF object. + + Returns: + np.ndarray: The features. + """ return self._featurize(mof.structure) - def _fit(self, structures: List[Structure]): + def _fit(self, structures: List[Structure]) -> np.ndarray: + """Call the fit method of the matminer featurizer.""" self.matminer_featurizer.fit(structures) - def fit(self, mofs): - self._fit([mof.structure for mof in mofs]) + def fit(self, mofs: List["MOF"]): + """Call the fit method of the matminer featurizer using a list of MOFs as input. + + Args: + mofs (List[MOF]): A list of MOF objects. + """ + self._fit([Structure.from_sites(mof.structure.sites) for mof in mofs]) def feature_labels(self): + """Call the feature_labels method of the matminer featurizer.""" return self.matminer_featurizer.feature_labels() def citations(self): + """Call the citations method of the matminer featurizer.""" return self.matminer_featurizer.citations() def implementors(self): + """Call the implementors method of the matminer featurizer.""" return self.matminer_featurizer.implementors() diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 90c6b3ab..0ea5db30 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -7,7 +7,9 @@ from pymatgen.core import Structure from mofdscribe.featurizers.bu.bu_featurizer import BUFeaturizer, MOFBBs +from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.featurizers.topology import PHStats +from mofdscribe.mof import MOF def test_bu_featurizer(hkust_structure, molecule): @@ -21,7 +23,7 @@ def test_bu_featurizer(hkust_structure, molecule): assert features[600] < 2 featurizer = BUFeaturizer(PHStats(no_supercell=True)) - features = featurizer.featurize(structure=hkust_structure) + features = featurizer.featurize(mof=MOF(hkust_structure)) assert features.shape == (768,) assert features[0] > 0 assert features[0] < 2 @@ -30,28 +32,31 @@ def test_bu_featurizer(hkust_structure, molecule): def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure): """Test that we can call BU featurizers with matminer molecules.""" # we disable the periodic keyword to be able to compare with the molecules - base_feat = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)) + base_feat = MatminerAdapter(SiteStatsFingerprint(SOAP(4, 4, 4, 0.1, False, "gto", False))) hkust_structure = Structure.from_sites(hkust_structure.sites) - base_feat.fit([hkust_structure]) featurizer = BUFeaturizer(base_feat, aggregations=("mean",)) - features = featurizer.featurize(structure=hkust_structure) - assert features.shape == (2592 * 2,) + featurizer.fit([MOF(hkust_structure)]) + features = featurizer.featurize(mof=MOF(hkust_structure)) + assert features.shape == (400 * 2,) assert features[0] >= 0 assert features[0] < 2 linker_feats = [f for f in featurizer.feature_labels() if "linker" in f] - assert len(linker_feats) == 2592 + assert len(linker_feats) == 400 # test that our fit method works featurizer = BUFeaturizer( - SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)), aggregations=("mean",) + MatminerAdapter(SiteStatsFingerprint(SOAP(4, 4, 4, 0.1, False, "gto", False))), + aggregations=("mean",), ) - featurizer.fit([hkust_structure]) - features_direct_fit = featurizer.featurize(structure=hkust_structure) + featurizer.fit([MOF(hkust_structure)]) + features_direct_fit = featurizer.featurize(mof=MOF(hkust_structure)) assert np.allclose(features, features_direct_fit, rtol=0.01) # test that the linker features are actually the ones we get when # we featurize the linker - linker_feats = featurizer._featurizer.featurize(hkust_linker_structure) + linker_feats = featurizer._featurizer._featurize(hkust_linker_structure) linker_feature_mask = [i for i, f in enumerate(featurizer.feature_labels()) if "linker" in f] - assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.01, equal_nan=True) + assert len(features[linker_feature_mask]) == len(linker_feats) + # todo: check where potential differences come from + assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3) diff --git a/tests/test_files/BTC.cif b/tests/test_files/BTC.cif index baabe76e..c724395e 100644 --- a/tests/test_files/BTC.cif +++ b/tests/test_files/BTC.cif @@ -1,57 +1,44 @@ - -####################################################################### -# -# Cambridge Crystallographic Data Centre -# CCDC -# -####################################################################### -# -# If this CIF has been generated from an entry in the Cambridge -# Structural Database, then it will include bibliographic, chemical, -# crystal, experimental, refinement or atomic coordinate data resulting -# from the CCDC's data processing and validation procedures. -# -####################################################################### - +# generated using pymatgen data_HC3O2 -_symmetry_cell_setting triclinic _symmetry_space_group_name_H-M 'P 1' -_symmetry_Int_Tables_number 1 -_space_group_name_Hall 'P 1' +_cell_length_a 7.82719732 +_cell_length_b 7.82719732 +_cell_length_c 7.82719732 +_cell_angle_alpha 90.00000000 +_cell_angle_beta 90.00000000 +_cell_angle_gamma 90.00000000 +_symmetry_Int_Tables_number 1 +_chemical_formula_structural HC3O2 +_chemical_formula_sum 'H3 C9 O6' +_cell_volume 479.53338312 +_cell_formula_units_Z 3 loop_ -_symmetry_equiv_pos_site_id -_symmetry_equiv_pos_as_xyz -1 x,y,z -_cell_length_a 26.34300000 -_cell_length_b 26.34300000 -_cell_length_c 26.34300000 -_cell_angle_alpha 90.00000000 -_cell_angle_beta 90.00000000 -_cell_angle_gamma 90.00000000 -_cell_volume 18280.8 + _symmetry_equiv_pos_site_id + _symmetry_equiv_pos_as_xyz + 1 'x, y, z' loop_ -_atom_site_label -_atom_site_type_symbol -_atom_site_fract_x -_atom_site_fract_y -_atom_site_fract_z -C378 C 0.32200000 0.67800000 0.61300000 -O379 O 0.44800000 0.74300000 0.68300000 -C380 C 0.43100000 0.70300000 0.70300000 -C381 C 0.29700000 0.70300000 0.56900000 -C382 C 0.32200000 0.61300000 0.67800000 -H383 H 0.38000000 0.72800000 0.62000000 -O384 O 0.31700000 0.74300000 0.55200000 -C385 C 0.38700000 0.67800000 0.67800000 -C386 C 0.30100000 0.63400000 0.63400000 -H387 H 0.27200000 0.62000000 0.62000000 -C388 C 0.36600000 0.63400000 0.69900000 -H389 H 0.38000000 0.62000000 0.72800000 -O390 O 0.44800000 0.68300000 0.74300000 -O391 O 0.25700000 0.68300000 0.55200000 -O392 O 0.25700000 0.55200000 0.68300000 -C393 C 0.36600000 0.69900000 0.63400000 -O394 O 0.31700000 0.55200000 0.74300000 -C395 C 0.29700000 0.56900000 0.70300000 - -#END + _atom_site_type_symbol + _atom_site_label + _atom_site_symmetry_multiplicity + _atom_site_fract_x + _atom_site_fract_y + _atom_site_fract_z + _atom_site_occupancy + H H0 1 -2.08665495 -1.27891755 -2.45013678 1 + O O1 1 -2.50062036 -1.50777648 -2.29868601 1 + C C2 1 -2.28185815 -1.30247656 -2.28185815 1 + O O3 1 -1.85779602 -0.86495213 -2.29868601 1 + C C4 1 -2.13377296 -1.22843396 -2.35253517 1 + O O5 1 -2.50062036 -1.06688648 -1.85779602 1 + H H6 1 -2.45013678 -1.27891755 -2.08665495 1 + C C7 1 -2.28185815 -1.08371434 -2.06309594 1 + C C8 1 -2.36599746 -0.99957503 -1.91501075 1 + C C9 1 -2.13377296 -1.01303732 -2.13377296 1 + O O10 1 -2.29868601 -0.86495213 -1.85779602 1 + O O11 1 -2.29868601 -1.50777648 -2.50062036 1 + C C12 1 -2.35253517 -1.22843396 -2.13377296 1 + C C13 1 -1.91501075 -0.99957503 -2.36599746 1 + C C14 1 -2.06309594 -1.08371434 -2.28185815 1 + H H15 1 -2.08665495 -0.91543572 -2.08665495 1 + O O16 1 -1.85779602 -1.06688648 -2.50062036 1 + C C17 1 -2.36599746 -1.45056175 -2.36599746 1 diff --git a/tests/test_matmineradapter.py b/tests/test_matmineradapter.py new file mode 100644 index 00000000..24c74bac --- /dev/null +++ b/tests/test_matmineradapter.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +import numpy as np +from matminer.featurizers.site import SOAP +from matminer.featurizers.structure import DensityFeatures, SiteStatsFingerprint +from pymatgen.core.structure import Structure + +from mofdscribe.featurizers.matmineradapter import MatminerAdapter +from mofdscribe.mof import MOF + + +def test_matminer_adapter(hkust_structure): + matminer_featurizer = DensityFeatures() + adapter = MatminerAdapter(matminer_featurizer) + features = adapter.featurize(MOF(hkust_structure)) + original_features = matminer_featurizer.featurize(hkust_structure) + + assert np.allclose(features, original_features) + + assert adapter.feature_labels() == matminer_featurizer.feature_labels() + assert adapter.citations() == matminer_featurizer.citations() + assert adapter.implementors() == matminer_featurizer.implementors() + + +def test_matminer_adapter_with_fit(hkust_structure): + original_featurizer = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)) + hkust_structure = Structure.from_sites(hkust_structure.sites) + original_featurizer.fit([hkust_structure]) + original_features = original_featurizer.featurize(hkust_structure) + + matminer_featurizer = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)) + adapter = MatminerAdapter(matminer_featurizer) + adapter.fit([MOF(hkust_structure)]) + features = adapter.featurize(MOF(hkust_structure)) + + assert np.allclose(features, original_features) From b3554a74e2dfcd114aacbc33aa42ff3b8ceb0e69 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Thu, 22 Dec 2022 19:47:40 +0100 Subject: [PATCH 05/17] continue refactor --- .../featurizers/bu/bu_featurizer.py | 102 ++++++++++++++++-- src/mofdscribe/featurizers/bu/bu_matches.py | 4 + .../bu/compositionstats_featurizer.py | 3 + .../bu/distance_hist_featurizer.py | 5 + .../bu/distance_stats_featurizer.py | 5 + .../featurizers/bu/lsop_featurizer.py | 13 ++- src/mofdscribe/featurizers/bu/rdkitadaptor.py | 15 +++ tests/featurizers/bu/test_bu_featurizer.py | 2 +- tests/featurizers/bu/test_lsop_featurizer.py | 3 +- 9 files changed, 140 insertions(+), 12 deletions(-) diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index 85ed7496..a2454508 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Compute features on the BUs and then aggregate them.""" +from abc import abstractmethod from typing import Collection, List, Optional, Tuple, Union import numpy as np @@ -10,6 +11,7 @@ from mofdscribe.featurizers.bu.utils import boxed_molecule from mofdscribe.featurizers.utils import nan_array, set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.mof import MOF STRUCTURE_VIEWS = ("all_atom", "connecting", "binding") @@ -22,14 +24,6 @@ class MOFBBs(BaseModel): linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] -class ConnectingSitesFeaturizer(MOFBaseFeaturizer): - ... - - -class BindingSitesFeaturizer(MOFBaseFeaturizer): - ... - - # If we would cache the fragmentation, the implementation would be simpler # ToDo: support moffragmentor options # ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method). @@ -62,6 +56,8 @@ class BUFeaturizer(MOFBaseFeaturizer): linkers=[BabelMolAdaptor.from_string("CCCC", "smi").pymatgen_mol])) """ + _NAME = "BUFeaturizer" + def __init__( self, featurizer: MOFBaseFeaturizer, @@ -91,7 +87,7 @@ def feature_labels(self) -> List[str]: for bb in ["node", "linker"]: for aggregation in self._aggregations: for label in base_labels: - labels.append(f"{bb}_{aggregation}_{label}") + labels.append(f"{self._NAME}_{bb}_{aggregation}_{label}") return labels def _extract_bbs( @@ -226,3 +222,91 @@ def citations(self) -> List[str]: def implementors(self) -> List[str]: return self._featurizer.implementors() + + +class _BUSubBaseFeaturizer(BUFeaturizer): + def featurize( + self, + mof: Optional["MOF"] = None, + ) -> np.ndarray: + """ + Compute features on the BUs and then aggregate them. + + If you provide a structure, we will fragment the MOF into BUs. + If you already have precomputed fragements or only want to consider a subset + of the BUs, you can provide them manually via the `mofbbs` argument. + + If you manually provide the `mofbbs`, we will convert molecules to structures + where possible. + + Args: + mof (MOF, optional): The structure to featurize. + + Returns: + A numpy array of features. + """ + # if i know what the featurizer wants, I can always cast to a structure + num_features = len(self._featurizer.feature_labels()) + nodes, linkers = self._extract(mof) + linker_feats = [self._featurizer._featurize(linker) for linker in linkers] + if not linker_feats: + linker_feats = [nan_array(num_features)] + + node_feats = [self._featurizer._featurize(node) for node in nodes] + if not node_feats: + node_feats = [nan_array(num_features)] + + aggregated_linker_feats = [] + for aggregation in self._aggregations: + aggregated_linker_feats.extend(ARRAY_AGGREGATORS[aggregation](linker_feats, axis=0)) + aggregated_linker_feats = np.array(aggregated_linker_feats) + + aggregated_node_feats = [] + for aggregation in self._aggregations: + aggregated_node_feats.extend(ARRAY_AGGREGATORS[aggregation](node_feats, axis=0)) + aggregated_node_feats = np.array(aggregated_node_feats) + + return np.concatenate((aggregated_node_feats, aggregated_linker_feats)) + + @abstractmethod + def _extract(self, mof: "MOF") -> Tuple[List[Structure], List[Structure]]: + raise NotImplementedError() + + def fit(self, mofs: Collection["MOF"]): + all_nodes, all_linkers = [], [] + for mof in mofs: + nodes, linkers = self._extract(mof) + all_nodes.extend(nodes) + all_linkers.extend(linkers) + self._featurizer.fit(all_nodes + all_linkers) + + def citations(self) -> List[str]: + return self._featurizer.citations() + + def implementors(self) -> List[str]: + return self._featurizer.implementors() + + +class BindingSitesFeaturizer(_BUSubBaseFeaturizer): + """A special BU featurizer that operates on structures spanned by "binding sites". + + From more details see. + + + Example: + >>> from mofdscribe.mof import MOF + >>> from mofdscribe.featurizers.bu.bu_featurizer import BindingSitesFeaturizer + >>> from mofdscribe.featurizers.bu.lsop_featurizer import LSOP + >>> import pandas as pd + >>> mof = MOF.from_file("mof_file.cif") + >>> featurizer = BindingSitesFeaturizer(LSOP()) + >>> feats = featurizer.featurize(mof) + >>> pd.DataFrame([feats], columns=featurizer.feature_labels()) + """ + + _NAME = "BindingSitesFeaturizer" + + def _extract(self, mof: "MOF"): + linkers = [l._get_binding_sites_structure() for l in mof.fragments.linkers] + nodes = [n._get_binding_sites_structure() for n in mof.fragments.nodes] + return nodes, linkers diff --git a/src/mofdscribe/featurizers/bu/bu_matches.py b/src/mofdscribe/featurizers/bu/bu_matches.py index 9140bdc1..ec154caa 100644 --- a/src/mofdscribe/featurizers/bu/bu_matches.py +++ b/src/mofdscribe/featurizers/bu/bu_matches.py @@ -173,6 +173,10 @@ def feature_labels(self) -> List[str]: return self._get_feature_labels() def featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: + """Structure is here spanned by the connecting points of a BU.""" + return self._featurize(s) + + def _featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: """Structure is here spanned by the connecting points of a BU.""" features = [] for topo in self.topos: diff --git a/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py b/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py index 7ec7b6a9..e6b2749d 100644 --- a/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py +++ b/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py @@ -58,6 +58,9 @@ def feature_labels(self) -> List[str]: return feature_labels def featurize(self, molecule: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: + return self._featurize(molecule) + + def _featurize(self, molecule: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: encodings = defaultdict(list) for encoding in self.encodings: for site in molecule.sites: diff --git a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py index 17e31dea..dc5fea1f 100644 --- a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py +++ b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py @@ -63,6 +63,11 @@ def feature_labels(self) -> List[str]: return [f"pairwise_distance_hist_{a}" for a in self._get_grid()] def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: + return self._featurize(structure) + + def _featurize( + self, structure: Union[Molecule, IMolecule, Structure, IStructure] + ) -> np.ndarray: distances = [] for i, _ in enumerate(structure): for j, _ in enumerate(structure): diff --git a/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py b/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py index d2231460..0816ab88 100644 --- a/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py +++ b/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py @@ -43,6 +43,11 @@ def feature_labels(self) -> List[str]: return [f"pairwise_distance_stats_{a}" for a in self.aggregations] def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: + return self._featurize(structure) + + def _featurize( + self, structure: Union[Molecule, IMolecule, Structure, IStructure] + ) -> np.ndarray: distances = [] for i, _ in enumerate(structure): for j, _ in enumerate(structure): diff --git a/src/mofdscribe/featurizers/bu/lsop_featurizer.py b/src/mofdscribe/featurizers/bu/lsop_featurizer.py index 63d3a42a..c4cf0e50 100644 --- a/src/mofdscribe/featurizers/bu/lsop_featurizer.py +++ b/src/mofdscribe/featurizers/bu/lsop_featurizer.py @@ -88,7 +88,18 @@ def __init__( def feature_labels(self) -> List[str]: return [f"lsop_{val}" for val in self.types] - def featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: + def featurize(self, mof: "MOF") -> np.ndarray: + """Compute the LSOP for a fragment. + + Args: + mof (MOF): The MOF to compute the LSOP for. + + Returns: + np.ndarray: The LSOP values. + """ + return self._featurize(mof.structure) + + def _featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: molecule = Molecule.from_sites(s.sites) com = molecule.center_of_mass orginal_len = len(molecule) diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py index dd5db2f7..b8e1ee35 100644 --- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py +++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py @@ -70,6 +70,21 @@ def featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray: If the input molecule is a Molecule, we convert it to a MoleculeGraph using the local environment strategy specified in the constructor. + Args: + molecule: A pymatgen Molecule or MoleculeGraph object. + + Returns: + A numpy array of features. + """ + return self._featurize(molecule) + + def _featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray: + """ + Call the RDKit featurizer on the molecule. + + If the input molecule is a Molecule, we convert it to a MoleculeGraph + using the local environment strategy specified in the constructor. + Args: molecule: A pymatgen Molecule or MoleculeGraph object. diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 0ea5db30..4535aa6e 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -58,5 +58,5 @@ def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_st linker_feats = featurizer._featurizer._featurize(hkust_linker_structure) linker_feature_mask = [i for i, f in enumerate(featurizer.feature_labels()) if "linker" in f] assert len(features[linker_feature_mask]) == len(linker_feats) - # todo: check where potential differences come from + # todo: check where potential differences come from assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3) diff --git a/tests/featurizers/bu/test_lsop_featurizer.py b/tests/featurizers/bu/test_lsop_featurizer.py index bacd3a50..1d5ea85c 100644 --- a/tests/featurizers/bu/test_lsop_featurizer.py +++ b/tests/featurizers/bu/test_lsop_featurizer.py @@ -2,12 +2,13 @@ """Test that the fragment LSOP featurizer works.""" from mofdscribe.featurizers.bu.lsop_featurizer import LSOP +from mofdscribe.mof import MOF def test_lsop_featurizer(triangle_structure): """Test that the LSOP featurizer works.""" featurizer = LSOP(types=["tri_plan", "sq_pyr", "cn"]) - features = featurizer.featurize(triangle_structure) + features = featurizer.featurize(MOF(triangle_structure)) assert features.shape == (3,) assert len(features) == len(featurizer.feature_labels()) assert features[0] > 0.5 From 96cc158cea4b8f627eebf223078fbca208575373 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 23 Dec 2022 13:56:14 +0100 Subject: [PATCH 06/17] refactoring done (?) --- setup.cfg | 18 ++-- src/mofdscribe/featurizers/base.py | 92 +++++-------------- .../featurizers/bu/bu_featurizer.py | 68 ++++++++++---- .../featurizers/bu/lsop_featurizer.py | 3 +- .../featurizers/bu/nconf20_featurizer.py | 2 +- src/mofdscribe/featurizers/chemistry/amd.py | 6 +- src/mofdscribe/featurizers/chemistry/aprdf.py | 3 +- .../featurizers/chemistry/energygrid.py | 3 +- src/mofdscribe/featurizers/chemistry/henry.py | 3 +- .../chemistry/partialchargehistogram.py | 3 +- .../chemistry/partialchargestats.py | 3 +- src/mofdscribe/featurizers/chemistry/price.py | 3 +- src/mofdscribe/featurizers/chemistry/racs.py | 18 +++- .../featurizers/graph/dimensionality.py | 4 +- .../featurizers/graph/graphfeaturizer.py | 3 + .../featurizers/hostguest/guest_aprdf.py | 3 +- .../hostguest/host_guest_featurizer.py | 7 +- src/mofdscribe/featurizers/matmineradapter.py | 10 +- .../featurizers/pore/geometric_properties.py | 14 ++- src/mofdscribe/featurizers/pore/voxelgrid.py | 3 +- .../featurizers/topology/atom_centered_ph.py | 5 +- .../featurizers/topology/ph_hist.py | 3 +- .../featurizers/topology/ph_image.py | 9 +- .../featurizers/topology/ph_stats.py | 4 +- .../featurizers/topology/ph_vect.py | 37 ++++++-- src/mofdscribe/mof.py | 5 +- tests/featurizers/bu/test_bu_featurizer.py | 48 +++++++++- tests/featurizers/test_base.py | 42 +-------- tests/featurizers/topology/test_ph_vect.py | 17 ++-- tests/test_matmineradapter.py | 1 + tox.ini | 8 +- 31 files changed, 247 insertions(+), 201 deletions(-) diff --git a/setup.cfg b/setup.cfg index ef2f8b0e..62ab6d3a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -177,17 +177,17 @@ strictness = short ######################### [flake8] ignore = - S301 # pickle - S403 # pickle + S301 + S403 S404 S603 - W503 # Line break before binary operator (flake8 is wrong) - E203 # whitespace before ':' - S101 # Complaining about assert statements - D101 # Docstring missing - D102 # Docstring missing - D103 # Docstring missing - D104 # Docstring missing + W503 + E203 + S101 + D101 + D102 + D103 + D104 D400 exclude = .tox, diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py index e5105a91..761058cc 100644 --- a/src/mofdscribe/featurizers/base.py +++ b/src/mofdscribe/featurizers/base.py @@ -12,16 +12,12 @@ or and iterable of pymatgen structure objects. """ from abc import abstractmethod -from multiprocessing import Pool from typing import Collection, Union import numpy as np -import pandas as pd -from loguru import logger from matminer.featurizers.base import BaseFeaturizer, MultipleFeaturizer from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure -from tqdm.auto import tqdm from mofdscribe.mof import MOF @@ -106,81 +102,37 @@ class MOFMultipleFeaturizer(MultipleFeaturizer): In this case, you're better off using two separate MultipleFeaturizers. """ - def __init__(self, featurizers, iterate_over_entries=True, primitive=True): + def __init__(self, featurizers): """ Construct a MOFMultipleFeaturizer. Args: featurizers (list): List of featurizers to use. - iterate_over_entries (bool): If True, featurize each entry in the list. - If False, featurize each structure in the list. Defaults to True. - primitive (bool): If True, use the primitive cell of the structure. - Defaults to True. """ - # unset the primitive on the individual featurizers - for featurizer in featurizers: - featurizer.primitive = False - + self.iterate_over_entries = True self.featurizers = featurizers - self.iterate_over_entries = iterate_over_entries - self.primitive_multiple = primitive - - def _get_primitive(self, structure: Union[Structure, IStructure]) -> Structure: - return get_primitive(structure) - - def featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - logger.debug(f"Featurizing structure in MOFMultipleFeaturizer, {self.primitive_multiple}") - if self.primitive_multiple: - logger.debug("Getting primitive cell") - structure = self._get_primitive(structure) - return np.array([feature for f in self.featurizers for feature in f.featurize(structure)]) - - def _get_primitive_many(self, structures): - if self.n_jobs == 1: - return [self._get_primitive(s) for s in structures] - else: - with Pool(self.n_jobs, maxtasksperchild=1) as p: - res = p.map(self._get_primitive, structures, chunksize=self.chunksize) - return res + + def featurize(self, mof: MOF) -> np.ndarray: + return np.array([feature for f in self.featurizers for feature in f.featurize(mof)]) def featurize_many(self, entries, ignore_errors=False, return_errors=False, pbar=True): - logger.debug("Precomputing primitive cells") - if self.primitive_multiple: - entries = self._get_primitive_many(entries) - if self.iterate_over_entries: - return np.array( - super().featurize_many( - entries, - ignore_errors=ignore_errors, - return_errors=return_errors, - pbar=pbar, - ) + return np.array( + super().featurize_many( + entries, + ignore_errors=ignore_errors, + return_errors=return_errors, + pbar=pbar, ) - else: - features = [ - f.featurize_many( - entries, - ignore_errors=ignore_errors, - return_errors=return_errors, - pbar=pbar, - ) - for f in self.featurizers - ] - - return np.hstack(features) + ) def featurize_wrapper(self, x, return_errors=False, ignore_errors=False): - if self.iterate_over_entries: - return np.array( - [ - feature - for f in self.featurizers - for feature in f.featurize_wrapper( - x, return_errors=return_errors, ignore_errors=ignore_errors - ) - ] - ) - else: - return super().featurize_wrapper( - x, return_errors=return_errors, ignore_errors=ignore_errors - ) + + return np.array( + [ + feature + for f in self.featurizers + for feature in f.featurize_wrapper( + x, return_errors=return_errors, ignore_errors=ignore_errors + ) + ] + ) diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index a2454508..2641c133 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -4,16 +4,18 @@ from typing import Collection, List, Optional, Tuple, Union import numpy as np +from matminer.featurizers.base import MultipleFeaturizer from pydantic import BaseModel from pymatgen.core import IMolecule, IStructure, Molecule, Structure -from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer from mofdscribe.featurizers.bu.utils import boxed_molecule from mofdscribe.featurizers.utils import nan_array, set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.mof import MOF -STRUCTURE_VIEWS = ("all_atom", "connecting", "binding") +__all__ = ("BUFeaturizer", "BranchingSitesFeaturizer", "BindingSitesFeaturizer") + # since fragmentation is not super straightforward, # we give the option to provide the fragments directly @@ -24,9 +26,6 @@ class MOFBBs(BaseModel): linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] -# If we would cache the fragmentation, the implementation would be simpler -# ToDo: support moffragmentor options -# ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method). class BUFeaturizer(MOFBaseFeaturizer): """ Compute features on the BUs and then aggregate them. @@ -76,7 +75,12 @@ def __init__( If you do not do this, we default to assuming that it operates on structures. aggregations (Tuple[str]): The aggregations to use. Must be one of :py:obj:`ARRAY_AGGREGATORS`. + + Raises: + ValueError: If the featurizer is a `MultipleFeaturizer`. """ + if isinstance(featurizer, (MOFMultipleFeaturizer, MultipleFeaturizer)): + raise ValueError("BUFeaturizer does not support MultipleFeaturizer.") self._featurizer = featurizer self._aggregations = aggregations set_operates_on(self, featurizer) @@ -92,7 +96,7 @@ def feature_labels(self) -> List[str]: def _extract_bbs( self, - mof: Optional["MOF"], + mof: Optional[MOF], mofbbs: Optional[MOFBBs] = None, ): if mof is None and mofbbs is None: @@ -144,14 +148,14 @@ def _fit(self): def fit( self, - mofs: Optional[Collection["MOF"]] = None, + mofs: Optional[Collection[MOF]] = None, mofbbs: Optional[Collection[MOFBBs]] = None, ) -> None: """ Fit the featurizer to the given structures. Args: - structures (Collection[MOF], optional): The MOFs to featurize. + mofs (Collection[MOF], optional): The MOFs to featurize. mofbbs (Collection[MOFBBs], optional): The MOF fragments (nodes and linkers). """ all_nodes, all_linkers = [], [] @@ -174,7 +178,7 @@ def _featurize(self): def featurize( self, - mof: Optional["MOF"] = None, + mof: Optional[MOF] = None, mofbbs: Optional[MOFBBs] = None, ) -> np.ndarray: """ @@ -227,7 +231,7 @@ def implementors(self) -> List[str]: class _BUSubBaseFeaturizer(BUFeaturizer): def featurize( self, - mof: Optional["MOF"] = None, + mof: Optional[MOF] = None, ) -> np.ndarray: """ Compute features on the BUs and then aggregate them. @@ -269,10 +273,10 @@ def featurize( return np.concatenate((aggregated_node_feats, aggregated_linker_feats)) @abstractmethod - def _extract(self, mof: "MOF") -> Tuple[List[Structure], List[Structure]]: + def _extract(self, mof: MOF) -> Tuple[List[Structure], List[Structure]]: raise NotImplementedError() - def fit(self, mofs: Collection["MOF"]): + def fit(self, mofs: Collection[MOF]): all_nodes, all_linkers = [], [] for mof in mofs: nodes, linkers = self._extract(mof) @@ -290,7 +294,11 @@ def implementors(self) -> List[str]: class BindingSitesFeaturizer(_BUSubBaseFeaturizer): """A special BU featurizer that operates on structures spanned by "binding sites". - From more details see. + We define binding sites as the linker atoms that directly connect to + the metal atoms of a node. + A good example are the carboxy oxygen atoms in a copper paddlewheel. + From more details see the `moffragmentor documentation + `_. Example: @@ -306,7 +314,35 @@ class BindingSitesFeaturizer(_BUSubBaseFeaturizer): _NAME = "BindingSitesFeaturizer" - def _extract(self, mof: "MOF"): - linkers = [l._get_binding_sites_structure() for l in mof.fragments.linkers] - nodes = [n._get_binding_sites_structure() for n in mof.fragments.nodes] + def _extract(self, mof: MOF): + linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers] + nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes] + return nodes, linkers + + +class BranchingSitesFeaturizer(_BUSubBaseFeaturizer): + """A special BU featurizer that operates on structures spanned by "branching sites". + + Branching sites are defined as the sites where a linker connects to a node. + The most common example is the carbon atom in a carboxy group. + From more details see the `moffragmentor documentation + `_. + + + Example: + >>> from mofdscribe.mof import MOF + >>> from mofdscribe.featurizers.bu.bu_featurizer import BranchingSitesFeaturizer + >>> from mofdscribe.featurizers.bu.lsop_featurizer import LSOP + >>> import pandas as pd + >>> mof = MOF.from_file("mof_file.cif") + >>> featurizer = BranchingSitesFeaturizer(LSOP()) + >>> feats = featurizer.featurize(mof) + >>> pd.DataFrame([feats], columns=featurizer.feature_labels()) + """ + + _NAME = "BranchingSitesFeaturizer" + + def _extract(self, mof: MOF): + linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers] + nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes] return nodes, linkers diff --git a/src/mofdscribe/featurizers/bu/lsop_featurizer.py b/src/mofdscribe/featurizers/bu/lsop_featurizer.py index c4cf0e50..60f4f964 100644 --- a/src/mofdscribe/featurizers/bu/lsop_featurizer.py +++ b/src/mofdscribe/featurizers/bu/lsop_featurizer.py @@ -14,6 +14,7 @@ operates_on_molecule, operates_on_structure, ) +from mofdscribe.mof import MOF _default_types = ( "cn", @@ -88,7 +89,7 @@ def __init__( def feature_labels(self) -> List[str]: return [f"lsop_{val}" for val in self.types] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: """Compute the LSOP for a fragment. Args: diff --git a/src/mofdscribe/featurizers/bu/nconf20_featurizer.py b/src/mofdscribe/featurizers/bu/nconf20_featurizer.py index b5b3a245..de38d89e 100644 --- a/src/mofdscribe/featurizers/bu/nconf20_featurizer.py +++ b/src/mofdscribe/featurizers/bu/nconf20_featurizer.py @@ -7,7 +7,7 @@ from rdkit import Chem from rdkit.Chem import AllChem -from .rdkitadaptor import RDKitAdaptor +from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor # ToDo: Allow to set some of the options, e.g. for UFF diff --git a/src/mofdscribe/featurizers/chemistry/amd.py b/src/mofdscribe/featurizers/chemistry/amd.py index 54a488be..55068171 100644 --- a/src/mofdscribe/featurizers/chemistry/amd.py +++ b/src/mofdscribe/featurizers/chemistry/amd.py @@ -7,6 +7,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.substructures import filter_element +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ["AMD"] @@ -81,7 +82,7 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, structure: StructureIStructureType) -> np.ndarray: @@ -92,6 +93,9 @@ def _featurize(self, structure: StructureIStructureType) -> np.ndarray: Returns: A numpy array containing the AMD descriptor. + + Raises: + ImportError: If the AMD package is not installed. """ try: from amd._nns import nearest_neighbours diff --git a/src/mofdscribe/featurizers/chemistry/aprdf.py b/src/mofdscribe/featurizers/chemistry/aprdf.py index 95ee0967..3376d7be 100644 --- a/src/mofdscribe/featurizers/chemistry/aprdf.py +++ b/src/mofdscribe/featurizers/chemistry/aprdf.py @@ -13,6 +13,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.aggregators import AGGREGATORS from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ["APRDF"] @@ -101,7 +102,7 @@ def _get_feature_labels(self): return list(aprdfs.flatten()) - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.array: diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py index fbb0e3f6..e8a8dd5b 100644 --- a/src/mofdscribe/featurizers/chemistry/energygrid.py +++ b/src/mofdscribe/featurizers/chemistry/energygrid.py @@ -13,6 +13,7 @@ from mofdscribe.featurizers.utils.histogram import get_rdf from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ["EnergyGridHistogram"] @@ -190,7 +191,7 @@ def feature_labels(self) -> List[str]: labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}") return labels - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.array: diff --git a/src/mofdscribe/featurizers/chemistry/henry.py b/src/mofdscribe/featurizers/chemistry/henry.py index fa8a13f0..fde1bbf2 100644 --- a/src/mofdscribe/featurizers/chemistry/henry.py +++ b/src/mofdscribe/featurizers/chemistry/henry.py @@ -11,6 +11,7 @@ from mofdscribe.featurizers.utils.raspa.base_parser import parse_base_output from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ["Henry"] @@ -142,7 +143,7 @@ def __init__( self.run_eqeq = run_eqeq self.return_std = return_std - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.array: diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py index 3ac436d9..963e7f78 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py @@ -9,6 +9,7 @@ from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ["PartialChargeHistogram"] @@ -49,7 +50,7 @@ def _get_grid(self): def feature_labels(self) -> List[str]: return [f"chargehist_{val}" for val in self._get_grid()] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(s=mof.structure) def _featurize(self, s: StructureIStructureType) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/chemistry/partialchargestats.py b/src/mofdscribe/featurizers/chemistry/partialchargestats.py index 4be296e0..fd668e85 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargestats.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargestats.py @@ -9,6 +9,7 @@ from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ["PartialChargeStats"] @@ -41,7 +42,7 @@ def __init__(self, aggregations: Tuple[str] = ("max", "min", "std")) -> None: def feature_labels(self) -> List[str]: return [f"chargestat_{agg}" for agg in self.aggregations] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(s=mof.structure) def _featurize(self, s: StructureIStructureType) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py index afde093d..3e80205f 100644 --- a/src/mofdscribe/featurizers/chemistry/price.py +++ b/src/mofdscribe/featurizers/chemistry/price.py @@ -9,6 +9,7 @@ import pandas as pd from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType _THIS_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -121,7 +122,7 @@ def feature_labels(self) -> List[str]: labels.append(f"price_lower_bound_{projection}") return labels - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, structure: StructureIStructureType) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index 4087d034..11012092 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -18,6 +18,7 @@ get_neighbors_at_distance, get_structure_graph, ) +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType __all__ = ("RACS",) @@ -201,13 +202,20 @@ def __init__( "nodes", ] - def featurize(self, mof: "MOF") -> np.ndarray: - return self._featurize(mof.structure) + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure_graph) - def _featurize(self, structure: StructureIStructureType) -> np.ndarray: - if isinstance(structure, Structure): + def _featurize(self, structure: Union[StructureIStructureType, StructureGraph]) -> np.ndarray: + if isinstance(structure, (Structure, IStructure)): structure = IStructure.from_sites(structure) - sg = get_structure_graph(structure, self.bond_heuristic) + sg = get_structure_graph(structure, self.bond_heuristic) + elif isinstance(structure, StructureGraph): + sg = structure + structure = sg.structure + else: + raise TypeError( + f"Structure must be pymatgen Structure or StructureGraph, found {type(structure)}" + ) racs = {} # This finds all indices for a particular subset of atoms diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py index 7c0d9c77..d966b544 100644 --- a/src/mofdscribe/featurizers/graph/dimensionality.py +++ b/src/mofdscribe/featurizers/graph/dimensionality.py @@ -7,13 +7,15 @@ from pymatgen.analysis.graphs import StructureGraph from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer +from mofdscribe.mof import MOF class Dimensionality(GraphFeaturizer): def __init__(self) -> None: + """Construct a new Dimensionality featurizer.""" super().__init__() - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure_graph) def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py index 4f5dddc0..20f3520b 100644 --- a/src/mofdscribe/featurizers/graph/graphfeaturizer.py +++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +"""Base class for featurizers that operate on graphs.""" from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_structuregraph @@ -12,4 +13,6 @@ # all of this could then be orchestrated in a pipeline object @operates_on_structuregraph class GraphFeaturizer(MOFBaseFeaturizer): + """Base class for graph featurizers.""" + pass diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py index 313c7433..7de9c50d 100644 --- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py +++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py @@ -10,6 +10,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.hostguest.utils import _extract_host_guest from mofdscribe.featurizers.utils.aggregators import AGGREGATORS +from mofdscribe.mof import MOF __all__ = ["GuestCenteredAPRDF"] @@ -94,7 +95,7 @@ def _extract_host_guest( local_env_method=self._local_env_method, ) - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(structure=mof.structure) def _featurize( diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py index 129ba3c4..1d725074 100644 --- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py +++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py @@ -11,6 +11,7 @@ from mofdscribe.featurizers.hostguest.utils import _extract_host_guest from mofdscribe.featurizers.utils import set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.mof import MOF # ToDo: How do we handle if there is no guest? @@ -92,12 +93,12 @@ def _extract_host_guest(self, structure: Union[Structure, IStructure]): local_env_method=self._local_env_method, ) - def fit(self, mofs: Collection["MOF"]): + def fit(self, mofs: Collection[MOF]): """ Fit the featurizer to the given MOFs. Args: - mofs (Collection["MOF"]): The MOFs to fit to. + mofs (Collection[MOF]): The MOFs to fit to. """ structures = [mof.structure for mof in mofs] self._fit(structures=structures) @@ -121,7 +122,7 @@ def _fit( self._featurizer.fit(all_hosts + all_guests) - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(structure=mof.structure) def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py index 8e8bb585..24be9ba5 100644 --- a/src/mofdscribe/featurizers/matmineradapter.py +++ b/src/mofdscribe/featurizers/matmineradapter.py @@ -7,23 +7,21 @@ from pymatgen.core.structure import Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.mof import MOF class MatminerAdapter(MOFBaseFeaturizer): - """ - MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe. + """MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe. That is, you can then pass :py:class:`mofdscribe.mof.MOF` objects to the featurizer. .. note:: - If your matminer featurizer needs an expensive computation, e.g., the structure graph, if cannot reuse the one computed on the ``MOF`` object. Example: - >>> from mofdscribe.featurizers.matmineradapter import MatminerAdapter >>> from matminer.featurizers.structure import DensityFeatures >>> from mofdscribe.mof import MOF @@ -49,7 +47,7 @@ def _featurize(self, structure: Structure): """Call the featurize method of the matminer featurizer.""" return self.matminer_featurizer.featurize(structure) - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: """Call the featurize method of the matminer featurizer using a MOF as input. Args: @@ -64,7 +62,7 @@ def _fit(self, structures: List[Structure]) -> np.ndarray: """Call the fit method of the matminer featurizer.""" self.matminer_featurizer.fit(structures) - def fit(self, mofs: List["MOF"]): + def fit(self, mofs: List[MOF]): """Call the fit method of the matminer featurizer using a list of MOFs as input. Args: diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py index 09ca6beb..3f36804e 100644 --- a/src/mofdscribe/featurizers/pore/geometric_properties.py +++ b/src/mofdscribe/featurizers/pore/geometric_properties.py @@ -21,6 +21,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils import is_tool from mofdscribe.featurizers.utils.tempdir import TEMPDIR +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType ZEOPP_BASE_COMMAND = ["network"] @@ -194,7 +195,7 @@ def __init__( self.labels = ["lis", "lifs", "lifsp"] self.ha = ha - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType): @@ -281,7 +282,7 @@ def __init__( self.labels = [f"{label}_{self.probe_radius}" for label in labels] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.ndarray: @@ -372,7 +373,7 @@ def __init__( ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.ndarray: @@ -462,8 +463,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( @@ -486,7 +485,7 @@ def __init__( def feature_labels(self) -> List[str]: return [f"ray_hist_{self.probe_radius}_{i}" for i in range(1000)] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.ndarray: @@ -566,7 +565,6 @@ def __init__( channel_radius: Optional[Union[str, float]] = None, hist_type: str = "derivative", ha: bool = False, - primitive: bool = True, ) -> None: """Initialize the PoreSizeDistribution featurizer. @@ -624,7 +622,7 @@ def __init__( def feature_labels(self) -> List[str]: return [f"psd_hist_{self.probe_radius}_{i}" for i in range(1000)] - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: StructureIStructureType) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py index 68795c01..9007f6af 100644 --- a/src/mofdscribe/featurizers/pore/voxelgrid.py +++ b/src/mofdscribe/featurizers/pore/voxelgrid.py @@ -8,6 +8,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase +from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType @@ -151,7 +152,7 @@ def __init__( self.flatten = flatten self.regular_bounding_box = regular_bounding_box - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, structure: StructureIStructureType) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/topology/atom_centered_ph.py b/src/mofdscribe/featurizers/topology/atom_centered_ph.py index c79be734..e14b4cb9 100644 --- a/src/mofdscribe/featurizers/topology/atom_centered_ph.py +++ b/src/mofdscribe/featurizers/topology/atom_centered_ph.py @@ -16,6 +16,7 @@ from mofdscribe.featurizers.utils import flatten from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF # Todo: allow doing this with cutoff and coordination shells @@ -73,7 +74,7 @@ def __init__( self.dimensions = dimensions self.alpha_weight = alpha_weight - def featurize(self, mof: "MOF", idx: int) -> np.ndarray: + def featurize(self, mof: MOF, idx: int) -> np.ndarray: """Compute the features for a single site. Args: @@ -226,7 +227,7 @@ def _get_relevant_atom_type(self, element: str) -> str: if element in atom_type: return atom_type - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py b/src/mofdscribe/featurizers/topology/ph_hist.py index b236b786..0fbdfa42 100644 --- a/src/mofdscribe/featurizers/topology/ph_hist.py +++ b/src/mofdscribe/featurizers/topology/ph_hist.py @@ -14,6 +14,7 @@ operates_on_molecule, operates_on_structure, ) +from mofdscribe.mof import MOF @operates_on_imolecule @@ -114,7 +115,7 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize( diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py index f2d03283..54f59545 100644 --- a/src/mofdscribe/featurizers/topology/ph_image.py +++ b/src/mofdscribe/featurizers/topology/ph_image.py @@ -17,6 +17,7 @@ operates_on_molecule, operates_on_structure, ) +from mofdscribe.mof import MOF @operates_on_imolecule @@ -110,8 +111,6 @@ def __init__( in the analysis (experimental!). Defaults to False. no_supercell (bool): If true, then the supercell is not created. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -175,7 +174,7 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize( @@ -205,14 +204,14 @@ def _featurize( features.append(np.array(results["image"][element][dim]).flatten()) return np.concatenate(features) - def fit(self, mofs: List["MOF"]) -> None: + def fit(self, mofs: List[MOF]) -> None: """Use structures to estimate the settings for the featurizer. Find the limits (maximum/minimum birth/death and persistence) for all the structures in the dataset and store them in the object. Args: - mofs (List["MOF"]): List of MOFs to find the limits for. + mofs (List[MOF]): List of MOFs to find the limits for. """ structures = [mof.structure for mof in mofs] self._fit(structures) diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py index bed8ee3d..650d358a 100644 --- a/src/mofdscribe/featurizers/topology/ph_stats.py +++ b/src/mofdscribe/featurizers/topology/ph_stats.py @@ -17,6 +17,7 @@ operates_on_molecule, operates_on_structure, ) +from mofdscribe.mof import MOF @operates_on_imolecule @@ -50,7 +51,6 @@ def __init__( aggregation_functions: Tuple[str] = ("min", "max", "mean", "std"), periodic: bool = False, no_supercell: bool = False, - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Initialize the PHStats object. @@ -105,7 +105,7 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() - def featurize(self, mof: "MOF") -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure) def _featurize( diff --git a/src/mofdscribe/featurizers/topology/ph_vect.py b/src/mofdscribe/featurizers/topology/ph_vect.py index 7566f793..02135c2c 100644 --- a/src/mofdscribe/featurizers/topology/ph_vect.py +++ b/src/mofdscribe/featurizers/topology/ph_vect.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Featurizers using persistent homology -- vectorized using Gaussian mixture models.""" from collections import defaultdict -from typing import List, Optional, Tuple, Union +from typing import Collection, List, Optional, Tuple, Union import numpy as np from loguru import logger @@ -9,14 +9,14 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import get_diagrams_for_structure from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, operates_on_istructure, operates_on_molecule, operates_on_structure, ) - -from ._tda_helpers import get_diagrams_for_structure +from mofdscribe.mof import MOF def _apply_and_fill(transformer_func, diagrams): @@ -237,7 +237,6 @@ def __init__( self.periodic = periodic self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -251,6 +250,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: @@ -269,9 +271,15 @@ def _featurize( compiled_results = self._reshape_results(res, 1).flatten() return compiled_results - def fit(self, structures: Union[Structure, IStructure, Molecule, IMolecule]) -> "PHVect": - if self.primitive: - structures = self._get_primitive_many(structures) + def fit(self, mofs: Collection["MOF"]): + """Fit the featurizer to a collection of MOFs. + + Args: + mofs (Collection[MOF]): A collection of MOFs to fit the featurizer to. + """ + self._fit([mof.structure for mof in mofs]) + + def _fit(self, structures: Union[Structure, IStructure, Molecule, IMolecule]) -> "PHVect": self.transformers, _ = _fit_transform_structures( self.transformers, structures, @@ -294,11 +302,20 @@ def _reshape_results(self, results, num_structures) -> np.ndarray: n_col += self.n_components return compiled_results - def fit_transform( + def fit_transform(self, mofs: Collection["MOF"]) -> np.ndarray: + """Fit the featurizer to a collection of MOFs and return the featurized values. + + Args: + mofs (Collection[MOF]): A collection of MOFs to fit the featurizer to. + + Returns: + np.ndarray: The featurized values. + """ + return self._fit_transform([mof.structure for mof in mofs]) + + def _fit_transform( self, structures: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: - if self.primitive: - structures = self._get_primitive_many(structures) self.transformers, results = _fit_transform_structures( self.transformers, structures, diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py index c79b31fa..dd5b3956 100644 --- a/src/mofdscribe/mof.py +++ b/src/mofdscribe/mof.py @@ -98,6 +98,7 @@ def from_file(cls, path: PathType, primitive: bool = True) -> "MOF": path (PathType): The path to the file. primitive (bool): Whether to use the primitive cell or not. Defaults to ``True``. + Returns: MOF: The MOF class. """ @@ -119,14 +120,14 @@ def fragments(self) -> namedtuple: namedtuple: :py:class:`~moffragmentor.fragment.Fragments` namedtuple. """ try: - from moffragmentor import MOF as MOFFragmentorMOF + from moffragmentor import MOF as MOFFRAGMENTORMOF except ImportError: raise ImportError( "moffragmentor is not installed. Please install it to use the fragments feature.\ See https://github.com/kjappelbaum/moffragmentor for more information." ) - mof = MOFFragmentorMOF(self.__structure, self.structure_graph) + mof = MOFFRAGMENTORMOF(self.__structure, self.structure_graph) return mof.fragment(**self.__fragmentation_kwargs) @cached_property diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 4535aa6e..51eac95e 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -2,11 +2,18 @@ """Test the BU featurizer.""" import numpy as np +import pandas as pd from matminer.featurizers.site import SOAP from matminer.featurizers.structure import SiteStatsFingerprint from pymatgen.core import Structure -from mofdscribe.featurizers.bu.bu_featurizer import BUFeaturizer, MOFBBs +from mofdscribe.featurizers.bu.bu_featurizer import ( + BindingSitesFeaturizer, + BranchingSitesFeaturizer, + BUFeaturizer, + MOFBBs, +) +from mofdscribe.featurizers.bu.lsop_featurizer import LSOP from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.featurizers.topology import PHStats from mofdscribe.mof import MOF @@ -60,3 +67,42 @@ def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_st assert len(features[linker_feature_mask]) == len(linker_feats) # todo: check where potential differences come from assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3) + + +def test_binding_site_featurizer(hkust_structure): + """Make sure we can featurize on the 'binding' substructure and that the features make some sense.""" + featurizer = BindingSitesFeaturizer(LSOP()) + feats = featurizer.featurize(mof=MOF(hkust_structure)) + labels = featurizer.feature_labels() + for label in labels: + assert "BindingSite" in label + assert len(feats) == len(labels) + + df = pd.DataFrame(data=[feats], columns=labels) + + assert df["BindingSitesFeaturizer_node_mean_lsop_cn"].values[0] == 8.0 + + assert df["BindingSitesFeaturizer_node_mean_lsop_cuboct_max"].values[0] > 0.95 + + assert df["BindingSitesFeaturizer_linker_mean_lsop_cn"].values[0] == 6.0 + + assert df["BindingSitesFeaturizer_linker_mean_lsop_hex_plan_max"].values[0] > 0.4 + + +def test_branching_site_featurizer(hkust_structure): + """Make sure we can featurize on the 'branching' substructure and that the features make some sense.""" + featurizer = BranchingSitesFeaturizer(LSOP()) + feats = featurizer.featurize(mof=MOF(hkust_structure)) + labels = featurizer.feature_labels() + for label in labels: + assert "BranchingSite" in label + assert len(feats) == len(labels) + + df = pd.DataFrame(data=[feats], columns=labels) + + assert df["BranchingSitesFeaturizer_node_mean_lsop_cn"].values[0] == 4.0 + assert df["BranchingSitesFeaturizer_linker_mean_lsop_cn"].values[0] == 3.0 + + assert df["BranchingSitesFeaturizer_node_mean_lsop_sq_plan_max"].values[0] > 0.9 + + assert df["BranchingSitesFeaturizer_linker_mean_lsop_tri_plan"].values[0] > 0.98 diff --git a/tests/featurizers/test_base.py b/tests/featurizers/test_base.py index 1f0d3ef5..f4ce290e 100644 --- a/tests/featurizers/test_base.py +++ b/tests/featurizers/test_base.py @@ -1,56 +1,24 @@ -import numpy as np -import pandas as pd -from matminer.featurizers.base import MultipleFeaturizer from matminer.featurizers.structure import DensityFeatures from mofdscribe.featurizers.base import MOFMultipleFeaturizer from mofdscribe.featurizers.chemistry import RACS +from mofdscribe.featurizers.matmineradapter import MatminerAdapter +from mofdscribe.mof import MOF def test_mofmultiplefeaturizer(hkust_structure, irmof_structure): """Test that the calls work. However, I currently do not know of a good way of testing if and how often get_primitive is called (other than looking at the logs)""" - structures = [hkust_structure, irmof_structure] - primitive_structures = [structure.get_primitive_structure() for structure in structures] - featurizer = MOFMultipleFeaturizer([RACS(), DensityFeatures()], primitive=True) - for featurizer_ in featurizer.featurizers: - assert featurizer_.primitive is False - + featurizer = MOFMultipleFeaturizer([RACS(), MatminerAdapter(DensityFeatures())]) # make sure the simple call works - features = featurizer.featurize(irmof_structure) + features = featurizer.featurize(MOF(irmof_structure)) assert ( len(features) == len(featurizer.feature_labels()) == len(RACS().feature_labels()) + len(DensityFeatures().feature_labels()) ) - featurizer_no_primitive = MOFMultipleFeaturizer([RACS(), DensityFeatures()], primitive=False) - features_no_prim = featurizer_no_primitive.featurize(primitive_structures[-1]) - - assert np.allclose(features, features_no_prim, rtol=1e-2, atol=1e-2, equal_nan=True) - - # now compare with using the "original" matminer MultipleFeaturizer - featurizer_orig = MultipleFeaturizer([RACS(), DensityFeatures()]) - features_orig = featurizer_orig.featurize(primitive_structures[-1]) - assert np.allclose(features, features_orig, rtol=1e-2, atol=1e-2, equal_nan=True) - # make sure the multiple calls work - features_many_1 = featurizer.featurize_many(structures) - features_many_1_labels = featurizer.feature_labels() + features_many_1 = featurizer.featurize_many([MOF(irmof_structure), MOF(hkust_structure)]) assert features_many_1.ndim == 2 - - featurizer = MOFMultipleFeaturizer( - [RACS(), DensityFeatures()], iterate_over_entries=False, primitive=True - ) - features_many_2 = featurizer.featurize_many(structures) - features_many_2_labels = featurizer.feature_labels() - assert features_many_2.ndim == 2 - - features_many_1_df = pd.DataFrame(features_many_1, columns=features_many_1_labels) - features_many_2_df = pd.DataFrame(features_many_2, columns=features_many_2_labels) - - # independent of the way we call the featurizer, the features should be the same - values_1 = features_many_1_df.values - values_2 = features_many_2_df[features_many_1_df.columns].values - assert np.allclose(values_1, values_2, rtol=1e-2, atol=1e-2, equal_nan=True) diff --git a/tests/featurizers/topology/test_ph_vect.py b/tests/featurizers/topology/test_ph_vect.py index 4ca80d6d..22ee06cf 100644 --- a/tests/featurizers/topology/test_ph_vect.py +++ b/tests/featurizers/topology/test_ph_vect.py @@ -3,6 +3,7 @@ import pytest from mofdscribe.featurizers.topology.ph_vect import PHVect +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -12,20 +13,20 @@ def test_ph_vect(hkust_structure, irmof_structure, hkust_la_structure): # should raise if not fitted with pytest.raises(ValueError): ph_vect = PHVect() - ph_vect.featurize(hkust_structure) + ph_vect.featurize(MOF(hkust_structure)) # should be able to fit and featurize ph_vect = PHVect(n_components=2, random_state=42) - ph_vect.fit([hkust_structure, irmof_structure]) + ph_vect.fit([MOF(hkust_structure), MOF(irmof_structure)]) # test fit_transform ph_vect = PHVect(n_components=2, random_state=42) - feat = ph_vect.fit_transform([hkust_structure, irmof_structure]) + feat = ph_vect.fit_transform([MOF(hkust_structure), MOF(irmof_structure)]) assert feat.shape == (2, 4 * 2 * 2) assert is_jsonable(dict(zip(ph_vect.feature_labels(), feat[0]))) assert feat.ndim == 2 - feat = ph_vect.featurize(hkust_structure) + feat = ph_vect.featurize(MOF(hkust_structure)) assert feat.ndim == 1 assert len(feat) == len(set(ph_vect.feature_labels())) @@ -36,14 +37,14 @@ def test_ph_vect(hkust_structure, irmof_structure, hkust_la_structure): ph_vect = PHVect( n_components=2, atom_types=None, alpha_weight="atomic_radius_calculated", random_state=42 ) - hkust_feats = ph_vect.fit_transform([hkust_structure]) - features_hkust_la = ph_vect.fit_transform([hkust_la_structure]) + hkust_feats = ph_vect.fit_transform([MOF(hkust_structure)]) + features_hkust_la = ph_vect.fit_transform([MOF(hkust_la_structure)]) assert hkust_feats.shape == features_hkust_la.shape assert (hkust_feats != features_hkust_la).any() # now, to be sure do not encode the same thing with the atomic radius ph_vect = PHVect(n_components=2, atom_types=None, alpha_weight=None, random_state=42) - hkust_feats = ph_vect.fit_transform([hkust_structure]) - features_hkust_la = ph_vect.fit_transform([hkust_la_structure]) + hkust_feats = ph_vect.fit_transform([MOF(hkust_structure)]) + features_hkust_la = ph_vect.fit_transform([MOF(hkust_la_structure)]) assert hkust_feats.shape == features_hkust_la.shape assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/test_matmineradapter.py b/tests/test_matmineradapter.py index 24c74bac..c6ae1bfe 100644 --- a/tests/test_matmineradapter.py +++ b/tests/test_matmineradapter.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +"""Test the MatminerAdapter.""" import numpy as np from matminer.featurizers.site import SOAP from matminer.featurizers.structure import DensityFeatures, SiteStatsFingerprint diff --git a/tox.ini b/tox.ini index 2616e35e..66c0a385 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ envlist = lint #manifest pyroma - flake8 < 5 + flake8 #mypy # documentation linters/checkers #doc8 @@ -56,17 +56,17 @@ description = Run linters. skip_install = true deps = darglint - flake8 < 5 + flake8 flake8-black flake8-bugbear flake8-colors flake8-docstrings - flake8-isort < 5 + flake8-isort flake8-print pep8-naming pydocstyle commands = - flake8 src/mofdscribe/ tests/ setup.py + flake8 --color always src/mofdscribe/ tests/ setup.py description = Run the flake8 tool with several plugins (docstrings, import order, pep8 naming). See https://cthoyt.com/2020/04/25/how-to-code-with-me-flake8.html for more information. [testenv:pyroma] From 5fb0c86d3e675de7f75a7a945a702c4e654a1bba Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 23 Dec 2022 18:41:24 +0100 Subject: [PATCH 07/17] refactoring done --- docs/source/api.rst | 3 +- docs/source/api/featurizers.rst | 17 ++++ docs/source/api/structures.rst | 9 +++ docs/source/contributing.rst | 9 ++- .../featurizers/bu/bu_featurizer.py | 81 ++++++++++++++++--- src/mofdscribe/featurizers/chemistry/price.py | 3 + src/mofdscribe/featurizers/chemistry/racs.py | 7 +- src/mofdscribe/featurizers/graph/__init__.py | 1 + .../featurizers/graph/dimensionality.py | 4 +- .../featurizers/hostguest/guest_aprdf.py | 4 +- .../hostguest/host_guest_featurizer.py | 6 +- src/mofdscribe/featurizers/hostguest/utils.py | 5 +- src/mofdscribe/featurizers/matmineradapter.py | 6 +- src/mofdscribe/featurizers/utils/__init__.py | 12 +-- src/mofdscribe/featurizers/utils/extend.py | 3 +- tests/featurizers/bu/test_bu_featurizer.py | 8 ++ .../featurizers/graph/test_dimensionality.py | 4 +- .../hostguest/test_host_guest_featurizer.py | 13 ++- 18 files changed, 155 insertions(+), 40 deletions(-) create mode 100644 docs/source/api/structures.rst diff --git a/docs/source/api.rst b/docs/source/api.rst index 537a6680..9e09f56f 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -10,4 +10,5 @@ API documentation api/splitters api/metrics api/bench - api/helpers \ No newline at end of file + api/helpers + api/structures \ No newline at end of file diff --git a/docs/source/api/featurizers.rst b/docs/source/api/featurizers.rst index 954b8b6f..048cb031 100644 --- a/docs/source/api/featurizers.rst +++ b/docs/source/api/featurizers.rst @@ -94,4 +94,21 @@ Host Guest featurization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. automodule:: mofdscribe.featurizers.hostguest.host_guest_featurizer + :members: + + +Graph featurization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: mofdscribe.featurizers.graph.graph_featurizer + :members: + +.. automodule:: mofdscribe.featurizers.graph.dimensionality + :members: + + +Matminer adapter +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: mofdscribe.featurizers.matmineradapter :members: \ No newline at end of file diff --git a/docs/source/api/structures.rst b/docs/source/api/structures.rst new file mode 100644 index 00000000..dc6e7045 --- /dev/null +++ b/docs/source/api/structures.rst @@ -0,0 +1,9 @@ + +Structure inputs +------------------- + +MOF +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: mofdscribe.mof + :members: diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 713aa7da..96c1fa63 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -5,11 +5,12 @@ Extending and contributing to mofdscribe Implementing a new featurizer ----------------------------- -To implement a new featurizer, you typically need to create a new class that inherits from the :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. In this class, you need to implement three methods: -:py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` and :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation`. +To implement a new featurizer, you typically need to create a new class that inherits from the :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. In this class, you need to implement four methods: +:py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` and :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation`. + +The main featurization logic happens in :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize`. +Your method should accept as input a :py:class:`~pymatgen.core.Structure` (:py:class:`~pymatgen.core.IStructure`, :py:class:`~pymatgen.core.Molecule`, :py:class:`~pymatgen.core.StructureGraph`) object and return a :py:class:`numpy.array`. The :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize` is supposed to extract the relevant object from a :py:object:`~mofdscribe.mof.MOF` object. -The main featurization logic happens in :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`. -Your method should accept as input a :py:class:`~pymatgen.core.Structure` object and return a :py:class:`numpy.array`. The :py:meth:`mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` method should return a list of strings that describe the features returned by :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`. The number of feature names should match the number of features returned by :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize` (i.e. the number of columns in the feature matrix). The :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation` method should return a list of strings of BibTeX citations for the featurizer. Generally, you also want to decorate your structure with the diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index 2641c133..38e2ee7a 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -6,6 +6,7 @@ import numpy as np from matminer.featurizers.base import MultipleFeaturizer from pydantic import BaseModel +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer @@ -14,6 +15,8 @@ from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.mof import MOF +if False: + from moffragmentor import SBU __all__ = ("BUFeaturizer", "BranchingSitesFeaturizer", "BindingSitesFeaturizer") @@ -26,6 +29,18 @@ class MOFBBs(BaseModel): linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] +def _structuregraph_from_indices(mof, indices) -> StructureGraph: + from moffragmentor.utils import remove_all_nodes_not_in_indices + + structure_graph = mof.structure_graph.__copy__() + remove_all_nodes_not_in_indices(structure_graph, indices) + return structure_graph + + +def structuregraph_from_bu(mof: "MOF", bu: "SBU") -> StructureGraph: + return _structuregraph_from_indices(mof, bu._original_indices) + + class BUFeaturizer(MOFBaseFeaturizer): """ Compute features on the BUs and then aggregate them. @@ -104,10 +119,15 @@ def _extract_bbs( if mof is not None: fragments = mof.fragments - - if self._operates_on in ("both", "molecule"): + if self._operates_on & set([Molecule]): linkers = [linker.molecule for linker in fragments.linkers] nodes = [node.molecule for node in fragments.nodes] + elif self._operates_on & set([StructureGraph]): + linkers = [structuregraph_from_bu(mof, linker) for linker in fragments.linkers] + nodes = [structuregraph_from_bu(mof, node) for node in fragments.nodes] + elif self._operates_on & set([MoleculeGraph]): + linkers = [linker.molecule_graph for linker in fragments.linkers] + nodes = [node.molecule_graph for node in fragments.nodes] else: # create a boxed structure linkers = [linker._get_boxed_structure() for linker in fragments.linkers] @@ -122,18 +142,22 @@ def _extract_bbs( raise ValueError("All nodes and linkers must be of the same type.") this_type = types[0] - if this_type in (Structure, IStructure) and self._operates_on in ("both", "structure"): + if this_type in (Structure, IStructure) and self._operates_on & set( + [Molecule, Structure] + ): # this is the simple case, we do not need to convert to molecules pass - elif this_type in (Molecule, IMolecule) and self._operates_on in ("both", "molecule"): + elif this_type in (Molecule, IMolecule) and self._operates_on & set( + [Molecule, Structure] + ): # again simple case, we do not need to convert to structures pass - elif this_type in (Molecule, IMolecule) and (self._operates_on == "structure"): + elif this_type in (Molecule, IMolecule) and (self._operates_on & set([Molecule])): # we need to convert to structures nodes = [boxed_molecule(node) for node in nodes] linkers = [boxed_molecule(linker) for linker in linkers] - elif this_type in (Structure, IStructure) and (self._operates_on == "molecule"): + elif this_type in (Structure, IStructure) and (self._operates_on & set([Molecule])): raise ValueError( "You provided structures for a featurizer that operates on molecules. " / "Cannot automatically convert to molecules from structures." @@ -291,6 +315,7 @@ def implementors(self) -> List[str]: return self._featurizer.implementors() +# ToDo: generalize extract depending on what the featurizer operates on class BindingSitesFeaturizer(_BUSubBaseFeaturizer): """A special BU featurizer that operates on structures spanned by "binding sites". @@ -315,9 +340,25 @@ class BindingSitesFeaturizer(_BUSubBaseFeaturizer): _NAME = "BindingSitesFeaturizer" def _extract(self, mof: MOF): - linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers] - nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes] - return nodes, linkers + if Structure in self._operates_on: + linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers] + nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes] + return nodes, linkers + elif StructureGraph in self._operates_on: + linkers = [ + _structuregraph_from_indices(mof, linker._original_binding_indices) + for linker in mof.fragments.linkers + ] + nodes = [ + _structuregraph_from_indices(mof, node._original_binding_indices) + for node in mof.fragments.nodes + ] + elif Molecule in self._operates_on: + raise NotImplementedError("BindingSitesFeaturizer does not support Molecule yet.") + else: + raise NotImplementedError( + f"BindingSitesFeaturizer does not support featurizers operating on {self._operates_on}." + ) class BranchingSitesFeaturizer(_BUSubBaseFeaturizer): @@ -343,6 +384,22 @@ class BranchingSitesFeaturizer(_BUSubBaseFeaturizer): _NAME = "BranchingSitesFeaturizer" def _extract(self, mof: MOF): - linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers] - nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes] - return nodes, linkers + if Structure in self._operates_on: + linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers] + nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes] + return nodes, linkers + elif StructureGraph in self._operates_on: + linkers = [ + _structuregraph_from_indices(mof, linker._original_binding_indices) + for linker in mof.fragments.linkers + ] + nodes = [ + _structuregraph_from_indices(mof, node._original_binding_indices) + for node in mof.fragments.nodes + ] + elif Molecule in self._operates_on: + raise NotImplementedError("BranchingSitesFeaturizer does not support Molecule yet.") + else: + raise NotImplementedError( + f"BranchingSitesFeaturizer does not support featurizers operating on {self._operates_on}." + ) diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py index 3e80205f..a89f5d83 100644 --- a/src/mofdscribe/featurizers/chemistry/price.py +++ b/src/mofdscribe/featurizers/chemistry/price.py @@ -9,6 +9,7 @@ import pandas as pd from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType @@ -77,6 +78,8 @@ def _price_per_atom(element_price_fractions_kg, structure): ) +@operates_on_istructure +@operates_on_structure class PriceLowerBound(MOFBaseFeaturizer): """Compute a lower bound for the price based on the element prices. diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index 11012092..23159ce8 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -12,7 +12,11 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS -from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.featurizers.utils.extend import ( + operates_on_istructure, + operates_on_structure, + operates_on_structuregraph, +) from mofdscribe.featurizers.utils.structure_graph import ( get_connected_site_indices, get_neighbors_at_distance, @@ -124,6 +128,7 @@ def _get_racs_for_bbs( @operates_on_istructure @operates_on_structure +@operates_on_structuregraph class RACS(MOFBaseFeaturizer): r"""Modified version of the revised autocorrelation functions (RACs) for MOFs. diff --git a/src/mofdscribe/featurizers/graph/__init__.py b/src/mofdscribe/featurizers/graph/__init__.py index e69de29b..5a4e7539 100644 --- a/src/mofdscribe/featurizers/graph/__init__.py +++ b/src/mofdscribe/featurizers/graph/__init__.py @@ -0,0 +1 @@ +from .dimensionality import Dimensionality # noqa: F401 diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py index d966b544..ff6814e8 100644 --- a/src/mofdscribe/featurizers/graph/dimensionality.py +++ b/src/mofdscribe/featurizers/graph/dimensionality.py @@ -7,9 +7,11 @@ from pymatgen.analysis.graphs import StructureGraph from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_structuregraph from mofdscribe.mof import MOF +@operates_on_structuregraph class Dimensionality(GraphFeaturizer): def __init__(self) -> None: """Construct a new Dimensionality featurizer.""" @@ -19,7 +21,7 @@ def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(mof.structure_graph) def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: - return get_dimensionality_larsen(structure_graph) + return np.array([get_dimensionality_larsen(structure_graph)]) def feature_labels(self) -> List[str]: return ["dimensionality"] diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py index 7de9c50d..3dce89d2 100644 --- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py +++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py @@ -5,7 +5,7 @@ import numpy as np from element_coder import encode -from pymatgen.core import IStructure, Structure +from pymatgen.core import IStructure, Structure, Molecule from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.hostguest.utils import _extract_host_guest @@ -91,7 +91,7 @@ def _extract_host_guest( return _extract_host_guest( structure=structure, remove_guests=True, - operates_on="molecule", + operates_on=set([Molecule]), local_env_method=self._local_env_method, ) diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py index 1d725074..f4c97249 100644 --- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py +++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py @@ -120,7 +120,7 @@ def _fit( all_hosts.append(host_guest.host) all_guests.extend(host_guest.guests) - self._featurizer.fit(all_hosts + all_guests) + self._featurizer._fit(all_hosts + all_guests) def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(structure=mof.structure) @@ -151,10 +151,10 @@ def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: ) guest_feats = [] - host_feats = self._featurizer.featurize(host_guest.host) + host_feats = self._featurizer._featurize(host_guest.host) for guest in host_guest.guests: - guest_feats.append(self._featurizer.featurize(guest)) + guest_feats.append(self._featurizer._featurize(guest)) aggregated_feats = [] for aggregation in self._aggregations: diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py index f53d7390..43a6acdb 100644 --- a/src/mofdscribe/featurizers/hostguest/utils.py +++ b/src/mofdscribe/featurizers/hostguest/utils.py @@ -39,7 +39,7 @@ class HostGuest(BaseModel): def _extract_host_guest( structure: Union[Structure, IStructure], - operates_on: str = "structure", + operates_on: str = set([Structure]), remove_guests: bool = True, local_env_method: str = "vesta", ): @@ -55,7 +55,8 @@ def _extract_host_guest( else: host = structure_graph.structure - if operates_on == "structure": + # ToDo: generalize this to other types + if operates_on & set([Structure]): mols = [boxed_molecule(mol) for mol in mols] return HostGuest(host=host, guests=mols) diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py index 24be9ba5..91702df9 100644 --- a/src/mofdscribe/featurizers/matmineradapter.py +++ b/src/mofdscribe/featurizers/matmineradapter.py @@ -7,9 +7,11 @@ from pymatgen.core.structure import Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_structure from mofdscribe.mof import MOF +@operates_on_structure class MatminerAdapter(MOFBaseFeaturizer): """MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe. @@ -60,7 +62,9 @@ def featurize(self, mof: MOF) -> np.ndarray: def _fit(self, structures: List[Structure]) -> np.ndarray: """Call the fit method of the matminer featurizer.""" - self.matminer_featurizer.fit(structures) + self.matminer_featurizer.fit( + [Structure.from_sites(structure.sites) for structure in structures] + ) def fit(self, mofs: List[MOF]): """Call the fit method of the matminer featurizer using a list of MOFs as input. diff --git a/src/mofdscribe/featurizers/utils/__init__.py b/src/mofdscribe/featurizers/utils/__init__.py index 57db31df..fa7b1c78 100644 --- a/src/mofdscribe/featurizers/utils/__init__.py +++ b/src/mofdscribe/featurizers/utils/__init__.py @@ -5,7 +5,7 @@ from shutil import which import numpy as np -from pymatgen.core import Molecule, Structure +from pymatgen.core import Structure if sys.version_info.major == 3 and sys.version_info.minor >= 10: from collections.abc import MutableMapping @@ -45,12 +45,6 @@ def flatten(d: dict, parent_key: str = "", sep: str = ".") -> dict: def set_operates_on(self, featurizer): try: _operates_on = featurizer.operates_on() - if (Structure in _operates_on) and (Molecule in _operates_on): - self._operates_on = "both" - elif Structure in _operates_on: - self._operates_on = "structure" - elif Molecule in _operates_on: - self._operates_on = "molecule" - + self._operates_on = set(_operates_on) except AttributeError: - self._operates_on = "structure" + self._operates_on = {Structure} diff --git a/src/mofdscribe/featurizers/utils/extend.py b/src/mofdscribe/featurizers/utils/extend.py index ed00c907..5d307996 100644 --- a/src/mofdscribe/featurizers/utils/extend.py +++ b/src/mofdscribe/featurizers/utils/extend.py @@ -3,7 +3,7 @@ from functools import partial from typing import Type -from pymatgen.analysis.graphs import StructureGraph +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure @@ -37,3 +37,4 @@ def operates_on(self): operates_on_molecule = partial(add_operates_on, type=Molecule) operates_on_imolecule = partial(add_operates_on, type=IMolecule) operates_on_structuregraph = partial(add_operates_on, type=StructureGraph) +operates_on_moleculegraph = partial(add_operates_on, type=MoleculeGraph) diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 51eac95e..14031012 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -14,6 +14,7 @@ MOFBBs, ) from mofdscribe.featurizers.bu.lsop_featurizer import LSOP +from mofdscribe.featurizers.graph.dimensionality import Dimensionality from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.featurizers.topology import PHStats from mofdscribe.mof import MOF @@ -35,6 +36,13 @@ def test_bu_featurizer(hkust_structure, molecule): assert features[0] > 0 assert features[0] < 2 + # test if it works with graph featurizers + featurizer = BUFeaturizer(Dimensionality(), aggregations=("mean",)) + features = featurizer.featurize(mof=MOF(hkust_structure)) + assert features.shape == (2,) + assert features[0] == 0 + assert features[1] == 0 + def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure): """Test that we can call BU featurizers with matminer molecules.""" diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py index 52e3e85e..f8943113 100644 --- a/tests/featurizers/graph/test_dimensionality.py +++ b/tests/featurizers/graph/test_dimensionality.py @@ -7,5 +7,5 @@ def test_dimensionality(hkust_structure): dim = Dimensionality() sg = get_structure_graph(hkust_structure, "vesta") - assert dim.featurize(MOF(hkust_structure)) == 3 - assert dim._featurize(sg) == 3 + assert dim.featurize(MOF(hkust_structure))[0] == 3 + assert dim._featurize(sg)[0] == 3 diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py index dbbd1bbf..b2993909 100644 --- a/tests/featurizers/hostguest/test_host_guest_featurizer.py +++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py @@ -1,14 +1,25 @@ # -*- coding: utf-8 -*- from matminer.featurizers.structure.sites import SiteStatsFingerprint +from mofdscribe.featurizers.chemistry import APRDF from mofdscribe.featurizers.hostguest import HostGuestFeaturizer +from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.mof import MOF def test_host_guest_featurizer(floating_structure): """Test the HostGuestFeaturizer.""" featurizer = HostGuestFeaturizer( - featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"), + featurizer=APRDF(), + aggregations=("mean",), + ) + features = featurizer.featurize(MOF(floating_structure)) + labels = featurizer._featurizer.feature_labels() + assert len(features) == 2 * len(labels) + + # Test the matminer adapter + featurizer = HostGuestFeaturizer( + featurizer=MatminerAdapter(SiteStatsFingerprint.from_preset("SOAP_formation_energy")), aggregations=("mean",), ) featurizer.fit([MOF(floating_structure)]) From 2d5981e06c05528608929ecb38ded79ce0a69ffd Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 23 Dec 2022 19:11:54 +0100 Subject: [PATCH 08/17] remove lint --- src/mofdscribe/featurizers/hostguest/guest_aprdf.py | 2 +- src/mofdscribe/featurizers/hostguest/utils.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py index 3dce89d2..b779786b 100644 --- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py +++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py @@ -5,7 +5,7 @@ import numpy as np from element_coder import encode -from pymatgen.core import IStructure, Structure, Molecule +from pymatgen.core import IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.hostguest.utils import _extract_host_guest diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py index 43a6acdb..28e72e8a 100644 --- a/src/mofdscribe/featurizers/hostguest/utils.py +++ b/src/mofdscribe/featurizers/hostguest/utils.py @@ -39,10 +39,12 @@ class HostGuest(BaseModel): def _extract_host_guest( structure: Union[Structure, IStructure], - operates_on: str = set([Structure]), + operates_on: Optional[str] = None, remove_guests: bool = True, local_env_method: str = "vesta", ): + if operates_on is None: + operates_on = set([Structure]) if not isinstance(structure, (IStructure)): structure = IStructure.from_sites(structure.sites) From 0995bd3228889ebe39f95478fdbb0b58b993370f Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sat, 24 Dec 2022 09:49:19 +0100 Subject: [PATCH 09/17] ready for test --- src/mofdscribe/mof.py | 11 ++++ tests/conftest.py | 6 ++ tests/featurizers/bu/test_bu_featurizer.py | 10 ++- tests/test_files/Fe-MOF-74.cif | 77 ++++++++++++++++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/Fe-MOF-74.cif diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py index dd5b3956..450ec4c7 100644 --- a/src/mofdscribe/mof.py +++ b/src/mofdscribe/mof.py @@ -64,6 +64,17 @@ def __init__( Defaults to ``"vesta"``. fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``. Defaults to ``None``. + Some relevant kwargs are: + * check_dimensionality (bool): Check if the node is 0D. + If not, split into isolated metals. + Defaults to True. + * create_single_metal_bus (bool): Create a single metal BUs. + Defaults to False. + * break_organic_nodes_at_metal (bool): Break nodes into single metal BU + if they appear "too organic". + + For the dimensionality featurizers applied to building blocks, you want + to be careful wtih ``check_dimensionality``. """ self.__structure = ( IStructure.from_sites(structure.sites) diff --git a/tests/conftest.py b/tests/conftest.py index 87cf7b86..433b6286 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,12 @@ def hkust_structure(): return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif")) +@pytest.fixture(scope="session") +def mof74_structure(): + """Return a pymatgen Structure for MOF74""" + return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "Fe-MOF-74.cif")) + + @pytest.fixture(scope="session") def hkust_path(): """Return a path to a HKUST structure""" diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 14031012..510b435a 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -20,7 +20,7 @@ from mofdscribe.mof import MOF -def test_bu_featurizer(hkust_structure, molecule): +def test_bu_featurizer(hkust_structure, molecule, mof74_structure): """Test that we can call BU featurizers with pymatgen molecules.""" featurizer = BUFeaturizer(PHStats(no_supercell=True)) @@ -43,6 +43,14 @@ def test_bu_featurizer(hkust_structure, molecule): assert features[0] == 0 assert features[1] == 0 + # make sure we find the rod node + featurizer = BUFeaturizer(Dimensionality(), aggregations=("mean",)) + mof4 = MOF(mof74_structure, fragmentation_kwargs={"check_dimensionality": False}) + features = featurizer.featurize(mof=mof4) + assert features.shape == (2,) + assert features[0] == 1 + assert features[1] == 0 + def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure): """Test that we can call BU featurizers with matminer molecules.""" diff --git a/tests/test_files/Fe-MOF-74.cif b/tests/test_files/Fe-MOF-74.cif new file mode 100644 index 00000000..2a6fd4bf --- /dev/null +++ b/tests/test_files/Fe-MOF-74.cif @@ -0,0 +1,77 @@ +data_crystal + +_cell_length_a 6.96775 +_cell_length_b 15.40000 +_cell_length_c 15.42110 +_cell_angle_alpha 117.56400 +_cell_angle_beta 98.67390 +_cell_angle_gamma 98.70470 + +_symmetry_space_group_name_Hall 'P 1' +_symmetry_space_group_name_H-M 'P 1' + +loop_ +_symmetry_equiv_pos_as_xyz + 'x,y,z' + +loop_ +_atom_site_label +_atom_site_type_symbol +_atom_site_fract_x +_atom_site_fract_y +_atom_site_fract_z +_atom_site_charge +C C 0.03198 0.90054 0.46577 0.1870000000 +C C 0.08103 0.46326 0.91497 -0.1740000000 +C C 0.13137 0.56584 0.09913 0.1870000000 +C C 0.16623 0.08519 0.54897 -0.1750000000 +C C 0.17945 0.43074 0.82994 0.3860000000 +C C 0.19214 0.98722 0.51494 -0.0710000000 +C C 0.20483 0.52779 0.01259 -0.0730000000 +C C 0.25129 0.60109 0.43035 0.3860000000 +C C 0.32290 0.51504 0.52768 -0.0720000000 +C C 0.34978 0.17032 0.60199 0.3860000000 +C C 0.38223 0.54845 0.46308 -0.1750000000 +C C 0.43442 0.46640 0.56592 0.1880000000 +C C 0.56558 0.53360 0.43408 0.1880000000 +C C 0.61777 0.45155 0.53692 -0.1750000000 +C C 0.65022 0.82968 0.39801 0.3860000000 +C C 0.67710 0.48496 0.47232 -0.0720000000 +C C 0.74871 0.39891 0.56965 0.3860000000 +C C 0.79517 0.47221 0.98741 -0.0730000000 +C C 0.80786 0.01278 0.48506 -0.0710000000 +C C 0.82055 0.56926 0.17006 0.3860000000 +C C 0.83377 0.91481 0.45103 -0.1750000000 +C C 0.86863 0.43416 0.90087 0.1870000000 +C C 0.91897 0.53674 0.08503 -0.1730000000 +C C 0.96802 0.09946 0.53423 0.1880000000 +H H 0.18345 0.52756 0.54920 0.0590000000 +H H 0.34407 0.97808 0.52739 0.0590000000 +H H 0.36584 0.54940 0.02155 0.0590000000 +H H 0.63416 0.45060 0.97845 0.0590000000 +H H 0.65593 0.02192 0.47261 0.0590000000 +H H 0.81655 0.47244 0.45080 0.0590000000 +O O 0.06914 0.36860 0.73969 -0.5690000000 +O O 0.07325 0.80916 0.43511 -0.5130000000 +O O 0.09643 0.61781 0.46347 -0.4540000000 +O O 0.26402 0.62706 0.19021 -0.5130000000 +O O 0.29956 0.62958 0.36820 -0.5690000000 +O O 0.32998 0.26088 0.63057 -0.5690000000 +O O 0.36301 0.43627 0.62724 -0.5130000000 +O O 0.36745 0.46401 0.84623 -0.4540000000 +O O 0.47868 0.84617 0.38113 -0.4540000000 +O O 0.52132 0.15383 0.61887 -0.4540000000 +O O 0.63255 0.53599 0.15377 -0.4540000000 +O O 0.63699 0.56373 0.37276 -0.5130000000 +O O 0.67002 0.73912 0.36943 -0.5690000000 +O O 0.70044 0.37042 0.63180 -0.5690000000 +O O 0.73598 0.37294 0.80979 -0.5130000000 +O O 0.90357 0.38219 0.53653 -0.4540000000 +O O 0.92675 0.19084 0.56489 -0.5130000000 +O O 0.93086 0.63140 0.26031 -0.5690000000 +Fe Fe 0.10807 0.32430 0.59200 1.1500000000 +Fe Fe 0.21621 0.73303 0.32332 1.1500000000 +Fe Fe 0.48320 0.59129 0.26667 1.1500000000 +Fe Fe 0.51680 0.40871 0.73333 1.1500000000 +Fe Fe 0.78379 0.26697 0.67668 1.1500000000 +Fe Fe 0.89193 0.67570 0.40800 1.1500000000 From 230a694be22a555d1cdb8573ad78712adf403d70 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sat, 24 Dec 2022 10:59:20 +0100 Subject: [PATCH 10/17] pass fragmentation kwargs --- src/mofdscribe/mof.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py index 450ec4c7..f0cc03b5 100644 --- a/src/mofdscribe/mof.py +++ b/src/mofdscribe/mof.py @@ -102,13 +102,14 @@ def structure_graph(self) -> StructureGraph: return self.__structure_graph @classmethod - def from_file(cls, path: PathType, primitive: bool = True) -> "MOF": + def from_file(cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict]=None) -> "MOF": """Create a MOF class from a file. Args: path (PathType): The path to the file. primitive (bool): Whether to use the primitive cell or not. Defaults to ``True``. + fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``. Returns: MOF: The MOF class. @@ -116,7 +117,8 @@ def from_file(cls, path: PathType, primitive: bool = True) -> "MOF": return cls( IStructure.from_file(path).get_primitive_structure() if primitive - else IStructure.from_file(path) + else IStructure.from_file(path), + fragmentation_kwargs=fragmentation_kwargs, ) @cached_property From c063c2d45019236b3fcb7d0eb00968d09168cbd4 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sat, 24 Dec 2022 11:20:01 +0100 Subject: [PATCH 11/17] docs: add graph-based feats [skip ci] --- docs/source/background.rst | 11 +++++++++++ .../featurizers/graph/dimensionality.rst | 18 ++++++++++++++++++ .../host_guest/host_guest_aprdf.rst | 2 ++ docs/source/references.rst | 2 ++ 4 files changed, 33 insertions(+) create mode 100644 docs/source/featurizers/graph/dimensionality.rst diff --git a/docs/source/background.rst b/docs/source/background.rst index d47926d7..e19d032a 100644 --- a/docs/source/background.rst +++ b/docs/source/background.rst @@ -60,6 +60,17 @@ descriptor that only considers the local environment (of e.g., 3 atoms). For thi featurizers/global/* +Graph featurizers +-------------------- + +Graph featurizers are a special class of featurizers that are based on the structure graph. The structure graph is a periodic graph in which the edges are bonds and the nodes are atoms. + +.. toctree:: + :glob: + :maxdepth: 1 + + featurizers/graph/* + BU-centered featurizers ----------------------------- diff --git a/docs/source/featurizers/graph/dimensionality.rst b/docs/source/featurizers/graph/dimensionality.rst new file mode 100644 index 00000000..ad8330af --- /dev/null +++ b/docs/source/featurizers/graph/dimensionality.rst @@ -0,0 +1,18 @@ +Dimensionality +.................. + +Returns the dimensionality of the structure. This measure is based on [LarsenDimensionality]_, where the structure graph is analyzed. + +In the case of MOFs, rod like structures are considered 1D, sheet like structures are considered 2D, and 3D structures are considered 3D. + +This can be interesting for the metal nodes, where the typical SBUs such as Cu-paddlewheels are 0D. However, many well-known MOFs such as MOF-74 have infinite rod nodes, that this featurizer would consider 1D. + +.. featurizer:: Dimensionality + :id: Dimensionality + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: false + :scope: global + :scalar: True + + Returns the dimensionality of the structure. This measure is based on [LarsenDimensionality]_, where the structure graph is analyzed. \ No newline at end of file diff --git a/docs/source/featurizers/host_guest/host_guest_aprdf.rst b/docs/source/featurizers/host_guest/host_guest_aprdf.rst index 2fba836e..89867792 100644 --- a/docs/source/featurizers/host_guest/host_guest_aprdf.rst +++ b/docs/source/featurizers/host_guest/host_guest_aprdf.rst @@ -5,6 +5,8 @@ This featurizer builds on the :ref:`APRDF` featurizer, but instead of using the correlations between all atoms, it only considers the ones between the guest and all host atoms (within some cutoff distance). +The APRDF is defined as: + .. math:: \operatorname{RDF}^{P}(R)=f \sum_{i, j}^{\text {all atom pairs }} P_{i} P_{j} \mathrm{e}^{-B\left(r_{i j}-R\right)^{2}} diff --git a/docs/source/references.rst b/docs/source/references.rst index 2c1201fe..3bf3bd47 100644 --- a/docs/source/references.rst +++ b/docs/source/references.rst @@ -112,3 +112,5 @@ References .. [Trappe] `Potoff, J. J.; Siepmann, J. I. Vapor–Liquid Equilibria of Mixtures Containing Alkanes, Carbon Dioxide, and Nitrogen. AIChE Journal 2001, 47 (7), 1676–1682. `_ .. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. `_ + +.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. `_ \ No newline at end of file From 13b127706a8635b34bc2e09722b55264fd7e19f4 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Thu, 29 Dec 2022 13:46:46 +0100 Subject: [PATCH 12/17] add more BU featurizers Fixes #419 --- .../source/featurizers/bu_centered/angles.rst | 24 ++++ .../featurizers/bu_centered/num_hops.rst | 25 ++++ docs/source/featurizers/global/size.rst | 4 + docs/source/getting_started.rst | 16 +++ src/mofdscribe/featurizers/bu/__init__.py | 11 +- .../featurizers/bu/angle_hist_featurizer.py | 82 +++++++++++ .../featurizers/bu/angle_stats_featurizer.py | 57 ++++++++ .../featurizers/bu/bu_featurizer.py | 128 +++++++++++++++++- .../bu/distance_hist_featurizer.py | 13 +- .../featurizers/chemistry/__init__.py | 1 + .../featurizers/chemistry/energygrid.py | 12 +- .../featurizers/chemistry/numatoms.py | 46 +++++++ .../chemistry/partialchargehistogram.py | 13 +- src/mofdscribe/featurizers/graph/numhops.py | 66 +++++++++ src/mofdscribe/featurizers/utils/mixins.py | 10 ++ src/mofdscribe/mof.py | 4 +- .../bu/test_angle_hist_featurizer.py | 11 ++ .../bu/test_angle_stats_featurizer.py | 11 ++ tests/featurizers/bu/test_bu_featurizer.py | 59 ++++++++ tests/featurizers/chemistry/test_num_atoms.py | 8 ++ tests/featurizers/graph/test_num_hop.py | 17 +++ 21 files changed, 597 insertions(+), 21 deletions(-) create mode 100644 docs/source/featurizers/bu_centered/angles.rst create mode 100644 docs/source/featurizers/bu_centered/num_hops.rst create mode 100644 docs/source/featurizers/global/size.rst create mode 100644 src/mofdscribe/featurizers/bu/angle_hist_featurizer.py create mode 100644 src/mofdscribe/featurizers/bu/angle_stats_featurizer.py create mode 100644 src/mofdscribe/featurizers/chemistry/numatoms.py create mode 100644 src/mofdscribe/featurizers/graph/numhops.py create mode 100644 src/mofdscribe/featurizers/utils/mixins.py create mode 100644 tests/featurizers/bu/test_angle_hist_featurizer.py create mode 100644 tests/featurizers/bu/test_angle_stats_featurizer.py create mode 100644 tests/featurizers/chemistry/test_num_atoms.py create mode 100644 tests/featurizers/graph/test_num_hop.py diff --git a/docs/source/featurizers/bu_centered/angles.rst b/docs/source/featurizers/bu_centered/angles.rst new file mode 100644 index 00000000..71d23d87 --- /dev/null +++ b/docs/source/featurizers/bu_centered/angles.rst @@ -0,0 +1,24 @@ +Angle-based description of BU shape +======================================= + +The following featurizers compute the angles between all pairs of atoms in a building block. +We always compute the angles A-COM-B, where COM is the center of mass of the building block. + +Given the distribution of the angles, we can compute fixed-length descriptors by either converting +the distribution to a histogram or computing some statistics (mean, standard deviation, etc.) of the distribution. + +.. featurizer:: PairWiseAngleHist + :id: PairWiseAngleHist + :considers_geometry: True + :considers_structure_graph: False + :encodes_chemistry: False + :scope: bu + :scalar: False + +.. featurizer:: PairWiseAngleStats + :id: PairWiseAngleStats + :considers_geometry: True + :considers_structure_graph: False + :encodes_chemistry: False + :scope: bu + :scalar: False diff --git a/docs/source/featurizers/bu_centered/num_hops.rst b/docs/source/featurizers/bu_centered/num_hops.rst new file mode 100644 index 00000000..597afb72 --- /dev/null +++ b/docs/source/featurizers/bu_centered/num_hops.rst @@ -0,0 +1,25 @@ +Shortest-Path Based Description of Building Blocks +====================================================== + +For certain targets, the proximity of connecting groups in a building unit +(e.g. carboxy groups) can be interesting features. + +One way to describe this generally is to compute the distribution +of shortest paths between special sites in the building units that +our ``moffragmentor`` package calls "binding sites" and "branching sites". + +.. featurizer:: BranchingNumHopFeaturizer + :id: BranchingNumHopFeaturizer + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: False + :scope: bu + :scalar: False + +.. featurizer:: BindingNumHopFeaturizer + :id: BindingNumHopFeaturizer + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: False + :scope: bu + :scalar: False diff --git a/docs/source/featurizers/global/size.rst b/docs/source/featurizers/global/size.rst new file mode 100644 index 00000000..1625e9cf --- /dev/null +++ b/docs/source/featurizers/global/size.rst @@ -0,0 +1,4 @@ +Extensive size descriptors +================================ + + diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 755c594d..fb45d341 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -62,6 +62,22 @@ directory. and notebook in the `examples folder `_. +.. admonition:: Saving time using the MOF object + :class: tip + + From our experience, the most time-consuming part of featurization is the + the computation of the structure graph or the fragments. + + Additionally, you often do not know in advance which featurizers you will + use. + + If you want to save in the case you need to compute additional features, + it can be practical to serialize the :py:class:`~mofdscribe.mof.MOF` objects + after the first featurization. + The objects will already contain the structure graph and the fragments (if + they have been computed in the first featurization). + + Using a reference dataset -------------------------- diff --git a/src/mofdscribe/featurizers/bu/__init__.py b/src/mofdscribe/featurizers/bu/__init__.py index b1658dfd..1add3acf 100644 --- a/src/mofdscribe/featurizers/bu/__init__.py +++ b/src/mofdscribe/featurizers/bu/__init__.py @@ -1,6 +1,15 @@ # -*- coding: utf-8 -*- """Featurizers operating on the secondary building units.""" -from .bu_featurizer import BUFeaturizer, MOFBBs # noqa: F401 +from .angle_hist_featurizer import PairWiseAngleHist # noqa: F401 +from .angle_stats_featurizer import PairWiseAngleStats # noqa: F401 +from .bu_featurizer import ( # noqa: F401 + BindingNumHopFeaturizer, + BindingSitesFeaturizer, + BranchingNumHopFeaturizer, + BranchingSitesFeaturizer, + BUFeaturizer, + MOFBBs, +) from .bu_matches import BUMatch # noqa: F401 from .compositionstats_featurizer import CompositionStats # noqa: F401 from .distance_hist_featurizer import PairwiseDistanceHist # noqa: F401 diff --git a/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py b/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py new file mode 100644 index 00000000..e51ea5c4 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +"""Describe molecules by computing histogram of angles A-COM-B between atoms A, B and COM. + +COM is the center of mass of the molecule. +""" +from typing import List, Union + +import numpy as np +from matminer.featurizers.base import BaseFeaturizer +from pymatgen.core import IMolecule, Molecule +from pymatgen.util.coord import get_angle + +from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule +from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin + + +@operates_on_molecule +@operates_on_imolecule +class PairWiseAngleHist(BaseFeaturizer, GetGridMixin): + _NAME = "PairWiseAngleHist" + + def __init__( + self, + lower_bound: float = 0.0, + upper_bound: float = 180.0, + bin_size: float = 20, + density: bool = True, + ) -> None: + """Create a new PairwiseDistanceHist featurizer. + + Args: + lower_bound (float): Lower bound of the histogram. + Defaults to 0.0. + upper_bound (float): Upper bound of the histogram. + Defaults to 180. + bin_size (float): Size of the bins. + Defaults to 20. + density (bool): Whether to return the density or the counts. + Defaults to True. + """ + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.bin_size = bin_size + self.density = density + + def feature_labels(self) -> List[str]: + return [ + f"{self._NAME}_{a}" + for a in self._get_grid(self.lower_bound, self.upper_bound, self.bin_size) + ] + + def _featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + # Get the pairwise distances + com = molecule.center_of_mass + angles = [] + for i in range(len(molecule)): + for j in range(len(molecule)): + if i > j: + v1 = molecule.cart_coords[i] - com + v2 = molecule.cart_coords[j] - com + angles.append(get_angle(v1, v2)) + + # Compute the histogram + features = get_rdf( + angles, + lower_lim=self.lower_bound, + upper_lim=self.upper_bound, + bin_size=self.bin_size, + density=self.density, + normalized=False, + ) + return features + + def featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + return self._featurize(molecule) + + def citations(self) -> List[str]: + return [] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py b/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py new file mode 100644 index 00000000..8469b012 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +"""Describe molecules by computing statistics for the distribution of angles A-COM-B between atoms A, B and COM. + +COM is the center of mass of the molecule. +""" + +from typing import List, Tuple, Union + +import numpy as np +from matminer.featurizers.base import BaseFeaturizer +from pymatgen.core import IMolecule, Molecule +from pymatgen.util.coord import get_angle + +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule + + +@operates_on_molecule +@operates_on_imolecule +class PairWiseAngleStats(BaseFeaturizer): + _NAME = "PairWiseAngleStats" + + def __init__(self, aggreations: Tuple[str] = ("mean", "std", "min", "max")) -> None: + """Create a new PairwiseDistanceStats featurizer. + + Args: + aggreations (Tuple[str]): Aggreations to compute. + Defaults to ("mean", "std", "min", "max"). + """ + self.aggreations = aggreations + + def feature_labels(self) -> List[str]: + return [f"{self._NAME}_{a}" for a in self.aggreations] + + def _featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + # Get the pairwise distances + com = molecule.center_of_mass + angles = [] + for i in range(len(molecule)): + for j in range(len(molecule)): + if i > j: + v1 = molecule.cart_coords[i] - com + v2 = molecule.cart_coords[j] - com + angles.append(get_angle(v1, v2)) + + feats = [ARRAY_AGGREGATORS[a](angles) for a in self.aggreations] + + return np.array(feats) + + def featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + return self._featurize(molecule) + + def citations(self) -> List[str]: + return [] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index 38e2ee7a..d67677b4 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Compute features on the BUs and then aggregate them.""" from abc import abstractmethod -from typing import Collection, List, Optional, Tuple, Union +from typing import Callable, Collection, List, Optional, Tuple, Union import numpy as np from matminer.featurizers.base import MultipleFeaturizer @@ -11,13 +11,20 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer from mofdscribe.featurizers.bu.utils import boxed_molecule +from mofdscribe.featurizers.graph.numhops import NumHops from mofdscribe.featurizers.utils import nan_array, set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.mof import MOF if False: - from moffragmentor import SBU -__all__ = ("BUFeaturizer", "BranchingSitesFeaturizer", "BindingSitesFeaturizer") + from moffragmentor import SBU # for type hints +__all__ = ( + "BUFeaturizer", + "BranchingSitesFeaturizer", + "BindingSitesFeaturizer", + "BranchingNumHopFeaturizer", + "BindingNumHopFeaturizer", +) # since fragmentation is not super straightforward, @@ -353,6 +360,7 @@ def _extract(self, mof: MOF): _structuregraph_from_indices(mof, node._original_binding_indices) for node in mof.fragments.nodes ] + return nodes, linkers elif Molecule in self._operates_on: raise NotImplementedError("BindingSitesFeaturizer does not support Molecule yet.") else: @@ -397,9 +405,123 @@ def _extract(self, mof: MOF): _structuregraph_from_indices(mof, node._original_binding_indices) for node in mof.fragments.nodes ] + return nodes, linkers elif Molecule in self._operates_on: raise NotImplementedError("BranchingSitesFeaturizer does not support Molecule yet.") else: raise NotImplementedError( f"BranchingSitesFeaturizer does not support featurizers operating on {self._operates_on}." ) + + +def _extract_branching_indices(bu): + return bu.graph_branching_indices + + +def _extract_binding_indices(bu): + return bu.binding_indices + + +class _NumSiteHops(BUFeaturizer): + _NAME = "NumBranchingSiteHops" + + def __init__(self, hop_stat_aggregations, aggregations, index_extractor: Callable): + self.hop_stat_aggregations = hop_stat_aggregations + self.aggregations = aggregations + self.index_extractor = index_extractor + self._featurizer = NumHops(self.hop_stat_aggregations) + + def _extract(self, mof: MOF): + + linkers = [linker.molecule_graph for linker in mof.fragments.linkers] + nodes = [node.molecule_graph for node in mof.fragments.nodes] + + linker_indices = [self.index_extractor(linker) for linker in mof.fragments.linkers] + node_indices = [self.index_extractor(node) for node in mof.fragments.nodes] + return nodes, linkers, node_indices, linker_indices + + def featurize(self, mof: Optional[MOF] = None) -> np.ndarray: + nodes, linkers, node_indices, linker_indices = self._extract(mof) + node_feats = np.array( + [ + self._featurizer._featurize(node, node_idx) + for node, node_idx in zip(nodes, node_indices) + ] + ) + linker_feats = np.array( + [ + self._featurizer._featurize(linker, linker_idx) + for linker, linker_idx in zip(linkers, linker_indices) + ] + ) + + node_aggregated = np.concatenate( + [ARRAY_AGGREGATORS[agg](node_feats, axis=0) for agg in self.aggregations] + ) + linker_aggregated = np.concatenate( + [ARRAY_AGGREGATORS[agg](linker_feats, axis=0) for agg in self.aggregations] + ) + return np.concatenate([node_aggregated, linker_aggregated]) + + def feature_labels(self) -> List[str]: + base_feature_labels = self._featurizer.feature_labels() + feature_labels = [] + for bb in ["node", "linker"]: + for agg in self.aggregations: + for feat in base_feature_labels: + feature_labels.append(f"{self._NAME}_{bb}_{agg}_{feat}") + return feature_labels + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] + + def citations(self) -> List[str]: + return [] + + +class BranchingNumHopFeaturizer(_NumSiteHops): + """Compute statistics on the shortest path lengths between branching sites.""" + + _NAME = "BranchingNumHopFeaturizer" + + def __init__( + self, + hop_stat_aggregations: Tuple[str] = ("mean", "std", "min", "max"), + aggregations: Tuple[str] = ("mean", "std", "min", "max"), + ): + """Construct a BranchingNumHopFeaturizer. + + Args: + hop_stat_aggregations (Tuple[str]): Aggregation functions to apply to the + shortest path lengths between branching sites on a building blocks. + Defaults to ("mean", "std", "min", "max"). + aggregations (Tuple[str]): Aggregation functions to apply to the + aggregated statistcs of the shortest path lengths between branching sites + of different building blocks of the same . + Defaults to ("mean", "std", "min", "max"). + """ + super().__init__(hop_stat_aggregations, aggregations, _extract_branching_indices) + + +class BindingNumHopFeaturizer(_NumSiteHops): + """Compute statistics on the shortest path lengths between binding sites.""" + + _NAME = "BindingNumHopFeaturizer" + + def __init__( + self, + hop_stat_aggregations: Tuple[str] = ("mean", "std", "min", "max"), + aggregations: Tuple[str] = ("mean", "std", "min", "max"), + ): + """Construct a BindingNumHopFeaturizer. + + Args: + hop_stat_aggregations (Tuple[str]): Aggregation functions to apply to the + shortest path lengths between branching sites on a building blocks. + Defaults to ("mean", "std", "min", "max"). + aggregations (Tuple[str]): Aggregation functions to apply to the + aggregated statistcs of the shortest path lengths between branching sites + of different building blocks of the same . + Defaults to ("mean", "std", "min", "max"). + """ + super().__init__(hop_stat_aggregations, aggregations, _extract_binding_indices) diff --git a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py index dc5fea1f..ef443fe4 100644 --- a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py +++ b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py @@ -13,13 +13,14 @@ operates_on_structure, ) from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin @operates_on_molecule @operates_on_imolecule @operates_on_istructure @operates_on_structure -class PairwiseDistanceHist(BaseFeaturizer): +class PairwiseDistanceHist(BaseFeaturizer, GetGridMixin): """ Describe the shape of molecules by computing a histogram of pairwise distances. @@ -32,6 +33,8 @@ class PairwiseDistanceHist(BaseFeaturizer): reported in [Zhang2022]_. """ + _NAME = "PairwiseDistanceHist" + def __init__( self, lower_bound: float = 0.0, @@ -56,11 +59,11 @@ def __init__( self.bin_size = bin_size self.density = density - def _get_grid(self): - return np.arange(self.lower_bound, self.upper_bound, self.bin_size) - def feature_labels(self) -> List[str]: - return [f"pairwise_distance_hist_{a}" for a in self._get_grid()] + return [ + f"{self._NAME}_{a}" + for a in self._get_grid(self.lower_bound, self.upper_bound, self.bin_size) + ] def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: return self._featurize(structure) diff --git a/src/mofdscribe/featurizers/chemistry/__init__.py b/src/mofdscribe/featurizers/chemistry/__init__.py index ade4df4c..f569d3e3 100644 --- a/src/mofdscribe/featurizers/chemistry/__init__.py +++ b/src/mofdscribe/featurizers/chemistry/__init__.py @@ -4,6 +4,7 @@ from .aprdf import APRDF # noqa: F401 from .energygrid import EnergyGridHistogram # noqa: F401 from .henry import Henry # noqa: F401 +from .numatoms import NumAtoms # noqa: F401 from .partialchargehistogram import PartialChargeHistogram # noqa: F401 from .partialchargestats import PartialChargeStats # noqa: F401 from .price import PriceLowerBound # noqa: F401 diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py index e8a8dd5b..1ccee4d1 100644 --- a/src/mofdscribe/featurizers/chemistry/energygrid.py +++ b/src/mofdscribe/featurizers/chemistry/energygrid.py @@ -11,6 +11,7 @@ from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa from mofdscribe.mof import MOF @@ -64,7 +65,7 @@ def read_ascii_grid(filename: Union[str, os.PathLike]) -> pd.DataFrame: @operates_on_istructure @operates_on_structure -class EnergyGridHistogram(MOFBaseFeaturizer): +class EnergyGridHistogram(MOFBaseFeaturizer, GetGridMixin): """Computes the energy grid histograms as originally proposed by Bucior et al. [Bucior2019]_. Conventionally, energy grids can be used to speed up molecular simulations. @@ -88,6 +89,8 @@ class EnergyGridHistogram(MOFBaseFeaturizer): https://doi.org/10.1063/5.0050823. """ + _NAME = "EnergyGridHistogram" + def __init__( self, raspa_dir: Union[str, os.PathLike, None] = None, @@ -180,15 +183,12 @@ def fit_transform(self, structures: List[Union[Structure, IStructure]]): def fit(self, structure: Union[Structure, IStructure]): ... - def _get_grid(self): - return np.arange(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw) - def feature_labels(self) -> List[str]: - grid = self._get_grid() + grid = self._get_grid(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw) labels = [] for site in self.sites: for grid_point in grid: - labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}") + labels.append(f"{self._NAME}_{self.mol_name}_{site}_{grid_point}") return labels def featurize(self, mof: MOF) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/chemistry/numatoms.py b/src/mofdscribe/featurizers/chemistry/numatoms.py new file mode 100644 index 00000000..fd4632fc --- /dev/null +++ b/src/mofdscribe/featurizers/chemistry/numatoms.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +"""Atom count featurizer.""" +from typing import List, Union + +import numpy as np +from pymatgen.core import IMolecule, IStructure, Molecule, Structure + +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import ( + operates_on_imolecule, + operates_on_istructure, + operates_on_molecule, + operates_on_structure, +) +from mofdscribe.mof import MOF + + +@operates_on_structure +@operates_on_istructure +@operates_on_molecule +@operates_on_imolecule +class NumAtoms(MOFBaseFeaturizer): + """Featurizer that returns the number of atoms in a structure.""" + + def __init__(self) -> None: + """Construct a new NumAtoms featurizer.""" + super().__init__() + + def _featurize( + self, structure_object: Union[Structure, IStructure, Molecule, IMolecule] + ) -> np.ndarray: + if not isinstance(structure_object, (Structure, IStructure, Molecule, IMolecule)): + raise ValueError("Structure object must be a pymatgen Structure or Molecule.") + return np.array([len(structure_object)]) + + def featurize(self, mof: MOF) -> int: + return len(mof.structure) + + def feature_labels(self) -> List[str]: + return ["num_atoms"] + + def citations(self) -> List[str]: + return [] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py index 963e7f78..7c5e43bd 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py @@ -9,6 +9,7 @@ from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType @@ -17,13 +18,15 @@ @operates_on_istructure @operates_on_structure -class PartialChargeHistogram(MOFBaseFeaturizer): +class PartialChargeHistogram(MOFBaseFeaturizer, GetGridMixin): """Compute partial charges using the EqEq charge equilibration method [Ongari2019]_. Then derive a fix-length feature vector from the partial charges by binning charges in a histogram. """ + _NAME = "PartialChargeHistogram" + def __init__( self, min_charge: float = -4, @@ -44,11 +47,11 @@ def __init__( self.max_charge = max_charge self.bin_size = bin_size - def _get_grid(self): - return np.arange(self.min_charge, self.max_charge, self.bin_size) - def feature_labels(self) -> List[str]: - return [f"chargehist_{val}" for val in self._get_grid()] + return [ + f"{self._NAME}_{val}" + for val in self._get_grid(self.min_charge, self.max_charge, self.bin_size) + ] def featurize(self, mof: MOF) -> np.ndarray: return self._featurize(s=mof.structure) diff --git a/src/mofdscribe/featurizers/graph/numhops.py b/src/mofdscribe/featurizers/graph/numhops.py new file mode 100644 index 00000000..7540aec3 --- /dev/null +++ b/src/mofdscribe/featurizers/graph/numhops.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +"""Featurizer that computes statistics on the number of hops between binding/branching sites.""" + +from typing import Collection, Tuple, Union + +import networkx as nx +import numpy as np +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph + +from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.extend import ( + operates_on_moleculegraph, + operates_on_structuregraph, +) +from mofdscribe.mof import MOF + + +def _hop_stats( + structuregraph: Union[StructureGraph, MoleculeGraph], indices: Collection[int], aggregations +) -> np.ndarray: + # for all combinations of indices compute the shortest path + # and return the statistics of the shortest paths + shortest_paths = [] + undirected = structuregraph.graph.to_undirected() + for i_count, i in enumerate(indices): + for j_count, j in enumerate(indices): + if i_count > j_count: + shortest_paths.append(nx.shortest_path_length(undirected, i, j)) + return np.array([ARRAY_AGGREGATORS[agg](shortest_paths) for agg in aggregations]) + + +@operates_on_structuregraph +@operates_on_moleculegraph +class NumHops(GraphFeaturizer): + """Featurizer that computes statistics on the shortest path lengths between sites.""" + + _NAME = "num_hops" + + def __init__(self, aggregations: Tuple[str] = ("mean", "std", "min", "max")) -> None: + """Construct a NumHops featurizer. + + Args: + aggregations: The aggregations to compute on the shortest path lengths. + Defaults to ("mean", "std", "min", "max"). + """ + self.aggregations = aggregations + + def _featurize( + self, structuregraph: Union[StructureGraph, MoleculeGraph], indices: Collection[int] + ) -> np.ndarray: + # for all combinations of indices compute the shortest path + # and return the statistics of the shortest paths + return _hop_stats(structuregraph, indices, self.aggregations) + + def featurize(self, mof: MOF, indices: Collection[int]) -> np.ndarray: + return self._featurize(mof.structure_graph, indices) + + def feature_labels(self) -> Collection[str]: + return [f"{self._NAME}_{agg}" for agg in self.aggregations] + + def citations(self) -> Collection[str]: + return [] + + def implementors(self) -> Collection[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/utils/mixins.py b/src/mofdscribe/featurizers/utils/mixins.py new file mode 100644 index 00000000..d995456a --- /dev/null +++ b/src/mofdscribe/featurizers/utils/mixins.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +"""Mixin classes for featurizers.""" +import numpy as np + + +class GetGridMixin: + """Mixin class for getting a linearly spaced grid.""" + + def _get_grid(self, lower_bound, upper_bound, bin_size): + return np.arange(lower_bound, upper_bound, bin_size) diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py index f0cc03b5..4cb43e75 100644 --- a/src/mofdscribe/mof.py +++ b/src/mofdscribe/mof.py @@ -102,7 +102,9 @@ def structure_graph(self) -> StructureGraph: return self.__structure_graph @classmethod - def from_file(cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict]=None) -> "MOF": + def from_file( + cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict] = None + ) -> "MOF": """Create a MOF class from a file. Args: diff --git a/tests/featurizers/bu/test_angle_hist_featurizer.py b/tests/featurizers/bu/test_angle_hist_featurizer.py new file mode 100644 index 00000000..e58ab7ab --- /dev/null +++ b/tests/featurizers/bu/test_angle_hist_featurizer.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +from mofdscribe.featurizers.bu.angle_hist_featurizer import PairWiseAngleHist + + +def test_angle_hist_featurizer(molecule): + featurizer = PairWiseAngleHist(density=True) + feats = featurizer.featurize(molecule) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert feats[0] == 0 diff --git a/tests/featurizers/bu/test_angle_stats_featurizer.py b/tests/featurizers/bu/test_angle_stats_featurizer.py new file mode 100644 index 00000000..d4d2f9d1 --- /dev/null +++ b/tests/featurizers/bu/test_angle_stats_featurizer.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +from mofdscribe.featurizers.bu.angle_stats_featurizer import PairWiseAngleStats + + +def test_angle_stats_featurizer(molecule): + featurizer = PairWiseAngleStats() + feats = featurizer.featurize(molecule) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert (feats[0] > 90) & (feats[0] < 120) + assert feats[-1] == 180 diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 510b435a..a49f8c87 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -8,7 +8,9 @@ from pymatgen.core import Structure from mofdscribe.featurizers.bu.bu_featurizer import ( + BindingNumHopFeaturizer, BindingSitesFeaturizer, + BranchingNumHopFeaturizer, BranchingSitesFeaturizer, BUFeaturizer, MOFBBs, @@ -122,3 +124,60 @@ def test_branching_site_featurizer(hkust_structure): assert df["BranchingSitesFeaturizer_node_mean_lsop_sq_plan_max"].values[0] > 0.9 assert df["BranchingSitesFeaturizer_linker_mean_lsop_tri_plan"].values[0] > 0.98 + + +def test_branching_num_hop(hkust_structure): + mof = MOF(hkust_structure) + + featurizer = BranchingNumHopFeaturizer() + feats = featurizer.featurize(mof=mof) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert np.allclose( + feats, + [ + 4.0, + 0.0, + 4.0, + 4.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.0, + 0.0, + 4.0, + 4.0, + 4.0, + 0.0, + 4.0, + 4.0, + 4.0, + 0.0, + 4.0, + 4.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.0, + 0.0, + 4.0, + 4.0, + 4.0, + 0.0, + 4.0, + 4.0, + ], + ) + + +def test_binding_num_hop(hkust_structure): + mof = MOF(hkust_structure) + + featurizer = BindingNumHopFeaturizer() + feats = featurizer.featurize(mof=mof) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert feats[2] == 2.0 + assert feats[3] == 4.0 diff --git a/tests/featurizers/chemistry/test_num_atoms.py b/tests/featurizers/chemistry/test_num_atoms.py new file mode 100644 index 00000000..30f52c68 --- /dev/null +++ b/tests/featurizers/chemistry/test_num_atoms.py @@ -0,0 +1,8 @@ +from mofdscribe.featurizers.chemistry import NumAtoms +from mofdscribe.mof import MOF + + +def test_num_atoms(hkust_structure): + mof = MOF(hkust_structure) + num_atoms = NumAtoms() + assert num_atoms.featurize(mof) == 624 diff --git a/tests/featurizers/graph/test_num_hop.py b/tests/featurizers/graph/test_num_hop.py new file mode 100644 index 00000000..a4bb6a44 --- /dev/null +++ b/tests/featurizers/graph/test_num_hop.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +import numpy as np + +from mofdscribe.featurizers.graph.numhops import NumHops +from mofdscribe.mof import MOF + + +def test_num_hops(hkust_structure): + mof = MOF(hkust_structure) + num_hops = NumHops() + features = num_hops.featurize(mof, [0, 463]) + assert len(features) == len(num_hops.feature_labels()) + assert np.allclose(features, [1.0, 0, 1.0, 1.0]) + + features = num_hops.featurize(mof, [0, 272]) + assert len(features) == len(num_hops.feature_labels()) + assert np.allclose(features, [2.0, 0.0, 2.0, 2.0]) From 0990f9a100807d71f9a3cba101c08bd81cfaac4f Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Thu, 29 Dec 2022 18:25:22 +0100 Subject: [PATCH 13/17] wip: update racs [ci skip] --- src/mofdscribe/featurizers/bu/racs.py | 203 ++++++++++++++++++ src/mofdscribe/featurizers/chemistry/racs.py | 42 +++- .../featurizers/utils/structure_graph.py | 80 ++++--- tests/featurizers/chemistry/test_fragment.py | 3 + tests/featurizers/chemistry/test_racs.py | 15 +- tests/utils/test_structure_graph.py | 14 +- 6 files changed, 311 insertions(+), 46 deletions(-) create mode 100644 src/mofdscribe/featurizers/bu/racs.py diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py new file mode 100644 index 00000000..5f29e872 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/racs.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +from typing import Tuple, Union, List, Optional, Collection, Dict, Set +import networkx.algorithms.community as nx_comm +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph +from mofdscribe.utils.extend import operates_on_moleculegraph, operates_on_structuregraph +from mofdscribe.mof import MOF +from element_coder.data.coding_data import get_coding_dict +from mofdscribe.utils.structuregraph import get_neighbors_up_to_scope +from collections import defaultdict +from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS +from mofdscribe.featurizers.chemistry.racs import compute_racs +from collections import OrderedDict + +def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]): + if isinstance(structuregraph, StructureGraph): + return structuregraph.structure.sites + elif isinstance(structuregraph, MoleculeGraph): + return structuregraph.molecule.sites + else: + raise ValueError("structuregraph must be either a StructureGraph or a MoleculeGraph") + +def _get_atom_site_indices(structuregraph: Union[StructureGraph, MoleculeGraph], atom_groups: Collection[Tuple[str, List[str], bool]]): + """Get the indices of the sites that belong to the atom groups.""" + atom_grouped_indices = defaultdict(set) + for atom_group_name, elements, _no_terminal in atom_groups: + for site in _get_site_iter(structuregraph): + if site.specie.symbol in elements: + atom_grouped_indices[atom_group_name].add(site.index) + return atom_grouped_indices + +def _split_up_communities( + structuregraph: Union[StructureGraph, MoleculeGraph], + communities: List[List[int]], + atom_groups: Collection[Tuple[str, List[str], bool]], +): + """Create dictionary of communities with atom groups as keys.""" + atom_grouped_communities = defaultdict(set) + indices_to_atom_group = _get_atom_site_indices() + + for atom_group_name, elements, _no_terminal in atom_groups: + for communities in communities: + atom_grouped_communities[atom_group_name].update(indices_to_atom_group & set(communities)) + + return atom_grouped_communities + + + + +_ALL_ELEMENTS = set(get_coding_dict("atomic").keys()) +_ALL_ELEMENTS_EXCEPT_C_H = _ALL_ELEMENTS - {"C", "H"} + + +# ToDo: generalize the code and implement in only one place +# the main racs code is quite similar to this one +def _get_racs_for_community( + community_indices, + structure_graph: StructureGraph, + neighbors_at_distance: Dict[int, Dict[int, Set[int]]], + properties: Tuple[str], + scopes: List[int], + property_aggregations: Tuple[str], + corr_aggregations: Tuple[str], + community_aggregations: Tuple[str], + community_name: str = "", +): + community_racs = defaultdict(list) + + if not community_indices: + # one nested list to make it trigger filling it with nan values + community_indices = [[]] + for start_indices in community_indices: + for scope in scopes: + racs = compute_racs( + start_indices, + structure_graph, + neighbors_at_distance, + properties, + scope, + property_aggregations, + corr_aggregations, + community_name, + ) + for k, v in racs.items(): + community_racs[k].append(v) + + aggregated_racs = {} + for racs_name, racs_values in community_racs.items(): + for community_agg in community_aggregations: + agg_func = ARRAY_AGGREGATORS[community_agg] + name = f"{racs_name}__atomgroupagg-{community_agg}" + aggregated_racs[name] = agg_func(racs_values) + + return aggregated_racs + + +@operates_on_structuregraph +@operates_on_moleculegraph +class ModularityCommunityCenteredRACS(MOFBaseFeaturizer): + _NAME = "ModularityCommunityCenteredRACS" + + def __init__( + self, + atom_groups: Optional[Collection[Tuple[str, Collection[str], bool]]] = None, + attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"), + scopes: Tuple[int] = (1, 2, 3), + prop_agg: Tuple[str] = ("product", "diff"), + corr_agg: Tuple[str] = ("sum", "avg"), + atom_groups_agg: Tuple[str] = ("avg", "sum"), + bond_heuristic: str = "vesta", + dont_use_communities: bool = False, + ): + # heteroatom groups is a list of atoms which we will aggregate seperately if they are in a community + # additional, we can specify if we only aggregate them seperately if they are not terminal or part of a bridge + # if the atom groups are specified, there should maybe (?) also be an option to specify "all" + if atom_groups is None: + atom_groups = [("all", _ALL_ELEMENTS, False)] + self.atom_groups = atom_groups + self.attributes = attributes + self.scopes = scopes + self.prop_agg = prop_agg + self.corr_agg = corr_agg + self.atom_groups_agg = atom_groups_agg + self.bond_heuristic = bond_heuristic + self.dont_use_communities = dont_use_communities + + def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): + + if not self.dont_use_communities: + communities = list( + nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected()) + ) + else: + communities = [range(len(structuregraph))] # make one community with all atoms + + neighbors_at_distance = { + i: get_neighbors_up_to_scope(structuregraph, i, max(self.scopes)) + for i in range(len(structuregraph)) + } + + groups = _split_up_communities(structuregraph, communities, self.atom_groups) + + racs = {} + for group_name, group_indices in groups.items(): + racs.update( + _get_racs_for_community( + group_indices, + structuregraph, + neighbors_at_distance, + self.attributes, + self.scopes, + self.prop_agg, + self.corr_agg, + self.atom_groups_agg, + group_name, + ) + ) + + racs_ordered = OrderedDict(sorted(racs.items())) + return np.array(list(racs_ordered.values())) + + + def featurize(self, mof: MOF): + return self._featurize(mof.structure_graph) + + def _get_feature_labels(self): + names = [] + for atom_group_name, _, _ in self.atom_groups: + for scope in self.scopes: + for prop in self.attributes: + for property_agg in self.prop_agg: + for cor_agg in self.corr_agg: + for atom_group_agg in self.atom_groups_agg: + names.append( + f"{self._NAME}-{atom_group_name}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_atomgroupagg-{atom_group_agg}" # noqa: E501 + ) + + names = sorted(names) + return names + + def feature_labels(self): + return self._get_feature_labels() + + def citations(self) -> List[str]: + return [ + "@article{Janet2017," + "doi = {10.1021/acs.jpca.7b08750}," + "url = {https://doi.org/10.1021/acs.jpca.7b08750}," + "year = {2017}," + "month = nov," + "publisher = {American Chemical Society ({ACS})}," + "volume = {121}," + "number = {46}," + "pages = {8939--8954}," + "author = {Jon Paul Janet and Heather J. Kulik}," + "title = {Resolving Transition Metal Chemical Space: " + "Feature Selection for Machine Learning and Structure{\textendash}Property Relationships}," + "journal = {The Journal of Physical Chemistry A}" + "}", + ] + + def implementors(self): + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index 23159ce8..baf78958 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Revised autocorrelation functions (RACs) for MOFs.""" from collections import OrderedDict, defaultdict -from typing import Collection, List, Optional, Tuple, Union +from typing import Collection, List, Optional, Tuple, Union, Dict, Set import numpy as np from element_coder import encode @@ -19,18 +19,19 @@ ) from mofdscribe.featurizers.utils.structure_graph import ( get_connected_site_indices, - get_neighbors_at_distance, get_structure_graph, + get_neighbors_up_to_scope, ) from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType -__all__ = ("RACS",) +__all__ = ("RACS", "compute_racs") -def _compute_racs( +def compute_racs( start_indices: Collection[int], structure_graph: StructureGraph, + neighbors_at_distance: Dict[int, Dict[int, Set[int]]], properties: Tuple[str], scope: int, property_aggregations: Tuple[str], @@ -38,6 +39,22 @@ def _compute_racs( part_name: str = "", nan_value: float = np.nan, ): + """Compute the RACs for a given set of properties and scope. + + Args: + start_indices (Collection[int]): The indices of the sites to start from. + structure_graph (StructureGraph): The structure graph. + neighbors_at_distance (Dict[int, Dict[int, Set[int]]], optional): The neighbors at distance. Defaults to None. + properties (Tuple[str]): The properties that are correlated + scope (int): The scope of the RACs. + property_aggregations (Tuple[str]): The aggregations to perform on the properties. + corr_aggregations (Tuple[str]): The aggregations to perform on the correlations. + part_name (str, optional): The name of the part. Defaults to "". + nan_value (float, optional): The value to use for missing values. Defaults to np.nan. + + Returns: + Dict[str, float]: The RACs. + """ racs = defaultdict(lambda: defaultdict(list)) if len(start_indices) == 0: logger.debug(f"No start indices for {part_name}") @@ -46,7 +63,7 @@ def _compute_racs( racs[prop][agg].append(nan_value) else: for start_atom in start_indices: - _, neighbors = get_neighbors_at_distance(structure_graph, start_atom, scope) + neighbors = neighbors_at_distance[start_atom][scope] num_neighbors = len(neighbors) # We only branch if there are actually neighbors scope many bonds away ... @@ -81,7 +98,7 @@ def _compute_racs( for aggregation_name, aggregation_values in property_values.items(): for corr_agg in corr_aggregations: agg_func = ARRAY_AGGREGATORS[corr_agg] - name = f"racs_bb-{part_name}_prop-{property_name}_scope-{scope}_propagg-{aggregation_name}_corragg-{corr_agg}" # noqa: E501 + name = f"racs-{part_name}_prop-{property_name}_scope-{scope}_propagg-{aggregation_name}_corragg-{corr_agg}" # noqa: E501 aggregated_racs[name] = agg_func(aggregation_values) return aggregated_racs @@ -90,6 +107,7 @@ def _compute_racs( def _get_racs_for_bbs( bb_indices: Collection[int], structure_graph: StructureGraph, + neighbors_at_distance: Dict[int, Dict[int, Set[int]]], properties: Tuple[str], scopes: List[int], property_aggregations: Tuple[str], @@ -104,9 +122,10 @@ def _get_racs_for_bbs( bb_indices = [[]] for start_indices in bb_indices: for scope in scopes: - racs = _compute_racs( + racs = compute_racs( start_indices, structure_graph, + neighbors_at_distance, properties, scope, property_aggregations, @@ -156,7 +175,7 @@ class RACS(MOFBaseFeaturizer): To use to original implementation, see `molSimplify `_. """ - + _MAME = "RACS" def __init__( self, attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"), @@ -222,6 +241,10 @@ def _featurize(self, structure: Union[StructureIStructureType, StructureGraph]) f"Structure must be pymatgen Structure or StructureGraph, found {type(structure)}" ) + neighbors_at_distance = { + i: get_neighbors_up_to_scope(sg, i, max(self.scopes)) for i in range(len(structure)) + } + racs = {} # This finds all indices for a particular subset of atoms # e.g. "nodes", "linker_all", "linker_connecting", "linker_functional", "linker_scaffold" @@ -231,6 +254,7 @@ def _featurize(self, structure: Union[StructureIStructureType, StructureGraph]) _get_racs_for_bbs( bb_indices[bb], sg, + neighbors_at_distance, self.attributes, self.scopes, self.prop_agg, @@ -251,7 +275,7 @@ def _get_feature_labels(self) -> List[str]: for cor_agg in self.corr_agg: for bb_agg in self.bb_agg: names.append( - f"racs_bb-{bb}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_bbagg-{bb_agg}" # noqa: E501 + f"{self._NAME}-{bb}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_bbagg-{bb_agg}" # noqa: E501 ) names = sorted(names) diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py index 8d8a150c..79d19842 100644 --- a/src/mofdscribe/featurizers/utils/structure_graph.py +++ b/src/mofdscribe/featurizers/utils/structure_graph.py @@ -1,45 +1,69 @@ # -*- coding: utf-8 -*- """perform analyses on structure graphs.""" from functools import lru_cache -from typing import List, Optional, Set, Tuple +from typing import List, Optional, Set, Tuple, Dict from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IStructure from structuregraph_helpers.create import get_structure_graph as get_sg +from collections import defaultdict +import networkx as nx +def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int]=None): + if bridges is None: + bridges = _generate_bridges(nx_graph) + sorted_edge = sorted(edge) + try: + bridge_edge = bridges[sorted_edge[0]] + return sorted_edge[1] in bridge_edge + except KeyError: + return False -def get_neighbors_at_distance( - structure_graph: StructureGraph, start: int, scope: int -) -> Tuple[Set[int], List[int]]: - """For a structure graph and a start site, return all sites within a certain\ - distance (scope) of the start site. + + +def _generate_bridges(nx_graph) -> Dict[int, int]: + + bridges = list(nx.bridges(nx_graph)) + + bridges_dict = defaultdict(list) + for key, value in bridges: + bridges_dict[key].append(value) + + + return bridges_dict + + + + +def get_connected_site_indices(sg, site: int): + """Get list of indices of neighboring sites.""" + return [site.index for site in sg.get_connected_sites(site)] + +def get_neighbors_up_to_scope(structure_graph, site_index, scope): + """Get only the neighbors at a certain scope. + + That is, scope=3 will return all neighbors three bonds away from the + site_index. Args: - structure_graph (StructureGraph): pymatgen StructureGraph object - start (int): starting atom - scope (int): distance to search + structure_graph (StructureGraph): The structure graph. + site_index (int): The site index. + scope (int): The scope. Returns: - Tuple[Set[int], List[int]]: All sites within the scope of the start - site, and the indices of the sites in the last shell + list: The list of neighbors. """ - # Todo: This code is stupid. - neighbors_at_last_level = [start] - all_neighbors = set() - neighbors_at_next_level = [] - for _ in range(scope): - for n in neighbors_at_last_level: - neighbors_at_next_level.extend(get_connected_site_indices(structure_graph, n)) - - all_neighbors.update(neighbors_at_last_level) - neighbors_at_last_level = neighbors_at_next_level - neighbors_at_next_level = [] - all_neighbors.remove(start) - neighbors_at_last_level = set(neighbors_at_last_level) - if start in neighbors_at_last_level: - neighbors_at_last_level.remove(start) - - return all_neighbors, neighbors_at_last_level + neighbors_in_scope = defaultdict(set) + neighbors_in_scope[0] = [site_index] + visited = set([site_index]) + for i in range(1, scope + 1): + for site in neighbors_in_scope[i - 1]: + neighbors = get_connected_site_indices(structure_graph, site) + neighbors = [n for n in neighbors if n not in visited] + neighbors_in_scope[i].update(neighbors) + visited.update(neighbors) + + return neighbors_in_scope @lru_cache() diff --git a/tests/featurizers/chemistry/test_fragment.py b/tests/featurizers/chemistry/test_fragment.py index e49fdfb5..cc386125 100644 --- a/tests/featurizers/chemistry/test_fragment.py +++ b/tests/featurizers/chemistry/test_fragment.py @@ -53,6 +53,8 @@ def test_get_bb_indices(hkust_structure): bb_indices = get_bb_indices(sg) assert len(bb_indices["nodes"]) < len(sg) assert len(bb_indices["nodes"]) > 0 + + # in this fragmentation, we assign as node only the metal part, but not the carboxy or assert ( len(set(sum(bb_indices["nodes"], []))) == dict(hkust_structure.composition)[Element("Cu")] ) @@ -70,6 +72,7 @@ def test_get_bb_indices(hkust_structure): == 0 ) + # HKUST has no functional groups assert len(sum(bb_indices["linker_functional"], [])) == 0 assert len(sum(bb_indices["linker_all"], [])) == len( sum(bb_indices["linker_scaffold"], []) diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 6262be54..3a0a2c48 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -7,7 +7,10 @@ from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs -from mofdscribe.featurizers.utils.structure_graph import get_structure_graph +from mofdscribe.featurizers.utils.structure_graph import ( + get_structure_graph, + get_neighbors_up_to_scope, +) from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -21,10 +24,15 @@ def test_racs(hkust_structure, irmof_structure): sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) racs = {} bb_indices = get_bb_indices(sg) + neighbors_at_distance = { + i: get_neighbors_up_to_scope(sg, i, max(featurizer.scopes)) + for i in range(len(structure)) + } for bb in featurizer._bbs: v = _get_racs_for_bbs( bb_indices[bb], sg, + neighbors_at_distance, featurizer.attributes, featurizer.scopes, featurizer.prop_agg, @@ -55,12 +63,17 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): feats = featurizer.featurize(MOF(structure)) # assert len(feats) == 4 * 3 * 8 * 5 # 4 properties, 3 scopes, 8 aggregations, 5 bb types sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) + neighbors_at_distance = { + i: get_neighbors_up_to_scope(sg, i, max(featurizer.scopes)) + for i in range(len(structure)) + } racs = {} bb_indices = get_bb_indices(sg) for bb in featurizer._bbs: v = _get_racs_for_bbs( bb_indices[bb], sg, + neighbors_at_distance, featurizer.attributes, featurizer.scopes, featurizer.prop_agg, diff --git a/tests/utils/test_structure_graph.py b/tests/utils/test_structure_graph.py index 6a8233af..0578feaf 100644 --- a/tests/utils/test_structure_graph.py +++ b/tests/utils/test_structure_graph.py @@ -1,11 +1,9 @@ # -*- coding: utf-8 -*- """Test helper functions for dealing with structure graphs.""" -from mofdscribe.featurizers.utils.structure_graph import get_neighbors_at_distance +from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope - -def test_get_neighbors_at_distance(hkust_structure_graph): - """Test the get_neighbors_at_distance function - at some case studies (manually verified).""" - assert len(get_neighbors_at_distance(hkust_structure_graph, 1, 1)[1]) == 4 - - assert len(get_neighbors_at_distance(hkust_structure_graph, 1, 2)[1]) == 4 +def test_get_neighbors_up_to_scope(hkust_structure_graph): + res = get_neighbors_up_to_scope(hkust_structure_graph, 1, 3) + assert len(res[1]) == 4 + assert len(res[2]) == 4 + assert len(res[3]) == 8 \ No newline at end of file From d6bc873b1fad3dd7b2f3ff6ff675374dcdf1586a Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Thu, 29 Dec 2022 21:54:57 +0100 Subject: [PATCH 14/17] wip: updated RACS [ci skip] --- .../source/featurizers/atom_centered/racs.rst | 2 + docs/source/featurizers/bu_centered/racs.rst | 20 +++ src/mofdscribe/featurizers/__init__.py | 33 ----- src/mofdscribe/featurizers/bu/racs.py | 123 +++++++++++++----- src/mofdscribe/featurizers/chemistry/racs.py | 5 +- .../featurizers/utils/definitions.py | 107 +++++++++++++++ .../featurizers/utils/structure_graph.py | 16 +-- tests/conftest.py | 6 + tests/featurizers/bu/test_racs.py | 15 +++ tests/featurizers/chemistry/test_racs.py | 2 +- tests/test_files/COF-1.cif | 75 +++++++++++ tests/utils/test_structure_graph.py | 5 +- 12 files changed, 331 insertions(+), 78 deletions(-) create mode 100644 docs/source/featurizers/bu_centered/racs.rst create mode 100644 src/mofdscribe/featurizers/utils/definitions.py create mode 100644 tests/featurizers/bu/test_racs.py create mode 100644 tests/test_files/COF-1.cif diff --git a/docs/source/featurizers/atom_centered/racs.rst b/docs/source/featurizers/atom_centered/racs.rst index 905030ad..5d2d5dc5 100644 --- a/docs/source/featurizers/atom_centered/racs.rst +++ b/docs/source/featurizers/atom_centered/racs.rst @@ -2,6 +2,8 @@ Revised autocorrelation functions (RACs) ............................................. +.. _RACS: + Revised autocorrelation functions have originally been proposed for metal-complexes [Janet2017]_. Autocorrelation functions have been widely used as compact, fixed-length descriptors and are defined as diff --git a/docs/source/featurizers/bu_centered/racs.rst b/docs/source/featurizers/bu_centered/racs.rst new file mode 100644 index 00000000..12018890 --- /dev/null +++ b/docs/source/featurizers/bu_centered/racs.rst @@ -0,0 +1,20 @@ + +Revised autocorrelation functions (RACs) +............................................. + +See also :ref:`RACs `. + +This featurizer is a flavor of the :ref:`RACs ` featurizer, that can split the computation over user-defined atom groups and automatically determined communities. + +In mofdscribe, you can customize the encodings :math:`P` (using all properties that are available in our `element-coder `_ package) as well as the aggregation functions. + +.. featurizer:: ModularityCommunityCenteredRACS + :id: ModularityCommunityCenteredRACS + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: optionally + :scope: local + :scalar: False + :style: only-light + + Initially described in [Janet2017]_ for metal complexes, extended to MOFs in [Moosavi2021]_. diff --git a/src/mofdscribe/featurizers/__init__.py b/src/mofdscribe/featurizers/__init__.py index 73f995e3..e69de29b 100644 --- a/src/mofdscribe/featurizers/__init__.py +++ b/src/mofdscribe/featurizers/__init__.py @@ -1,33 +0,0 @@ -from .bu import LSOP # noqa: F401 -from .bu import PMI1 # noqa: F401 -from .bu import PMI2 # noqa: F401 -from .bu import PMI3 # noqa: F401 -from .bu import Asphericity # noqa: F401 -from .bu import BUFeaturizer # noqa: F401 -from .bu import BUMatch # noqa: F401 -from .bu import CompositionStats # noqa: F401 -from .bu import DiskLikeness # noqa: F401 -from .bu import Eccentricity # noqa: F401 -from .bu import InertialShapeFactor # noqa: F401 -from .bu import MOFBBs # noqa: F401 -from .bu import NConf20 # noqa: F401 -from .bu import PairwiseDistanceHist # noqa: F401 -from .bu import RadiusOfGyration # noqa: F401 -from .bu import RDKitAdaptor # noqa: F401 -from .bu import RodLikeness # noqa: F401 -from .bu import SphereLikeness # noqa: F401 -from .bu import SpherocityIndex # noqa: F401 -from .chemistry import AMD # noqa: F401 -from .chemistry import APRDF # noqa: F401 -from .chemistry import RACS # noqa: F401 -from .chemistry import EnergyGridHistogram # noqa: F401 -from .chemistry import Henry # noqa: F401 -from .chemistry import PartialChargeHistogram # noqa: F401 -from .chemistry import PartialChargeStats # noqa: F401 -from .chemistry import PriceLowerBound # noqa: F401 -from .pore import AccessibleVolume # noqa: F401 -from .pore import PoreDiameters # noqa: F401 -from .pore import PoreSizeDistribution # noqa: F401 -from .pore import RayTracingHistogram # noqa: F401 -from .pore import SurfaceArea # noqa: F401 -from .topology import AtomCenteredPH, PHHist, PHImage, PHStats, PHVect # noqa: F401 diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py index 5f29e872..2144e5a9 100644 --- a/src/mofdscribe/featurizers/bu/racs.py +++ b/src/mofdscribe/featurizers/bu/racs.py @@ -1,16 +1,28 @@ # -*- coding: utf-8 -*- -from typing import Tuple, Union, List, Optional, Collection, Dict, Set +""""RACs on molecule and structure graphs with optional community detection.""" +from collections import OrderedDict, defaultdict +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + import networkx.algorithms.community as nx_comm -from mofdscribe.featurizers.base import MOFBaseFeaturizer +import numpy as np from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph -from mofdscribe.utils.extend import operates_on_moleculegraph, operates_on_structuregraph -from mofdscribe.mof import MOF -from element_coder.data.coding_data import get_coding_dict -from mofdscribe.utils.structuregraph import get_neighbors_up_to_scope -from collections import defaultdict -from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS + +from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.chemistry.racs import compute_racs -from collections import OrderedDict +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.definitions import ( + ALL_ELEMENTS, + ALL_ELEMENTS_EXCEPT_C_H, + ALL_METAL_ELEMENTS, + ALL_NONMETAL_ELEMENTS, +) +from mofdscribe.featurizers.utils.extend import ( + operates_on_moleculegraph, + operates_on_structuregraph, +) +from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope +from mofdscribe.mof import MOF + def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]): if isinstance(structuregraph, StructureGraph): @@ -20,38 +32,43 @@ def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]): else: raise ValueError("structuregraph must be either a StructureGraph or a MoleculeGraph") -def _get_atom_site_indices(structuregraph: Union[StructureGraph, MoleculeGraph], atom_groups: Collection[Tuple[str, List[str], bool]]): + +def _get_atom_site_indices( + structuregraph: Union[StructureGraph, MoleculeGraph], + atom_groups: Collection[Tuple[str, List[str], bool]], +): """Get the indices of the sites that belong to the atom groups.""" atom_grouped_indices = defaultdict(set) for atom_group_name, elements, _no_terminal in atom_groups: - for site in _get_site_iter(structuregraph): + for i, site in enumerate(_get_site_iter(structuregraph)): if site.specie.symbol in elements: - atom_grouped_indices[atom_group_name].add(site.index) + atom_grouped_indices[atom_group_name].add(i) return atom_grouped_indices + def _split_up_communities( structuregraph: Union[StructureGraph, MoleculeGraph], communities: List[List[int]], atom_groups: Collection[Tuple[str, List[str], bool]], ): """Create dictionary of communities with atom groups as keys.""" - atom_grouped_communities = defaultdict(set) - indices_to_atom_group = _get_atom_site_indices() + atom_grouped_communities = defaultdict(list) + indices_to_atom_group = _get_atom_site_indices( + structuregraph=structuregraph, atom_groups=atom_groups + ) for atom_group_name, elements, _no_terminal in atom_groups: for communities in communities: - atom_grouped_communities[atom_group_name].update(indices_to_atom_group & set(communities)) + if not isinstance(communities, set): + communities = set([communities]) + atom_grouped_communities[atom_group_name].append( + indices_to_atom_group[atom_group_name] & set(communities) + ) return atom_grouped_communities - - - -_ALL_ELEMENTS = set(get_coding_dict("atomic").keys()) -_ALL_ELEMENTS_EXCEPT_C_H = _ALL_ELEMENTS - {"C", "H"} - -# ToDo: generalize the code and implement in only one place +# ToDo: generalize the code and implement in only one place # the main racs code is quite similar to this one def _get_racs_for_community( community_indices, @@ -97,6 +114,8 @@ def _get_racs_for_community( @operates_on_structuregraph @operates_on_moleculegraph class ModularityCommunityCenteredRACS(MOFBaseFeaturizer): + """RACs on molecule and structure graphs with optional community detection.""" + _NAME = "ModularityCommunityCenteredRACS" def __init__( @@ -107,31 +126,72 @@ def __init__( prop_agg: Tuple[str] = ("product", "diff"), corr_agg: Tuple[str] = ("sum", "avg"), atom_groups_agg: Tuple[str] = ("avg", "sum"), - bond_heuristic: str = "vesta", dont_use_communities: bool = False, ): - # heteroatom groups is a list of atoms which we will aggregate seperately if they are in a community - # additional, we can specify if we only aggregate them seperately if they are not terminal or part of a bridge - # if the atom groups are specified, there should maybe (?) also be an option to specify "all" + """Constuct a ModularityCommunityCenteredRACS featurizer. + + Features are computed for each atom group and for each community. The features are then aggregated + over the communities. + + Args: + atom_groups (Optional[Collection[Tuple[str, Collection[str], bool]]], optional): + Elements which form start scopes for RACs. + The first element is the name of the atom group, and will appear in the feature name. + The second element is a list of elements that belong to the atom group. For example, + ('C-H', ['C', 'H'], False) will create a feature for all carbon atoms and hydrogen atoms. + The third element of the tuple is currently not used (but will be used as a flag to + indicate whether to include terminal/leading to terminal atoms in the start scope). + If set to None, there will be one atom group with all atoms. + Defaults to None. + attributes (Tuple[Union[int, str]], optional): + Elemental properties used for the construction of RACs. Defaults to ("X", "mod_pettifor", "I", "T"). + scopes (Tuple[int], optional): Scopes used for the construction of RACs. + Defaults to (1, 2, 3). + prop_agg (Tuple[str], optional): Aggregation methods used for the aggregation of + properties. "Product" corresponds to "product-RACs" and "diff" to "difference-RACs". + Defaults to ("product", "diff"). + corr_agg (Tuple[str], optional): Aggregation methods used for the aggregation of the + correlated properties. Defaults to ("sum", "avg"). + atom_groups_agg (Tuple[str], optional): Aggregation methods used for the pooling + over atom communities within one atom group. Defaults to ("avg", "sum"). + dont_use_communities (bool, optional): If set to true, we do not use modularity-based community detection. + Features are then simply averaged over all atoms. + Defaults to False. + """ if atom_groups is None: - atom_groups = [("all", _ALL_ELEMENTS, False)] + atom_groups = [("all", ALL_ELEMENTS, False)] self.atom_groups = atom_groups self.attributes = attributes self.scopes = scopes self.prop_agg = prop_agg self.corr_agg = corr_agg self.atom_groups_agg = atom_groups_agg - self.bond_heuristic = bond_heuristic self.dont_use_communities = dont_use_communities + @classmethod + def from_preset(cls, preset: str, **kwargs): + if preset.lower() == "cof": + atom_groups = [ + ("all", ALL_ELEMENTS, False), + ("C-H", {"C", "H"}, False), + ("not_C-H", ALL_ELEMENTS_EXCEPT_C_H, False), + ] + elif preset.lower() == "mof": + atom_groups = [ + ("all", ALL_ELEMENTS, False), + ("metal", ALL_METAL_ELEMENTS, False), + ("nonmetal", ALL_NONMETAL_ELEMENTS, False), + ] + return cls(atom_groups=atom_groups, **kwargs) + def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): - + if not self.dont_use_communities: communities = list( nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected()) ) else: - communities = [range(len(structuregraph))] # make one community with all atoms + communities = [range(len(structuregraph))] # make one community with all atoms neighbors_at_distance = { i: get_neighbors_up_to_scope(structuregraph, i, max(self.scopes)) @@ -140,6 +200,8 @@ def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): groups = _split_up_communities(structuregraph, communities, self.atom_groups) + print(groups) + racs = {} for group_name, group_indices in groups.items(): racs.update( @@ -159,7 +221,6 @@ def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): racs_ordered = OrderedDict(sorted(racs.items())) return np.array(list(racs_ordered.values())) - def featurize(self, mof: MOF): return self._featurize(mof.structure_graph) diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index baf78958..7c35fd49 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Revised autocorrelation functions (RACs) for MOFs.""" from collections import OrderedDict, defaultdict -from typing import Collection, List, Optional, Tuple, Union, Dict, Set +from typing import Collection, Dict, List, Optional, Set, Tuple, Union import numpy as np from element_coder import encode @@ -19,8 +19,8 @@ ) from mofdscribe.featurizers.utils.structure_graph import ( get_connected_site_indices, - get_structure_graph, get_neighbors_up_to_scope, + get_structure_graph, ) from mofdscribe.mof import MOF from mofdscribe.types import StructureIStructureType @@ -176,6 +176,7 @@ class RACS(MOFBaseFeaturizer): `_. """ _MAME = "RACS" + def __init__( self, attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"), diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py new file mode 100644 index 00000000..d3f33bd0 --- /dev/null +++ b/src/mofdscribe/featurizers/utils/definitions.py @@ -0,0 +1,107 @@ +from element_coder.data.coding_data import get_coding_dict + +ALL_ELEMENTS = set(get_coding_dict("atomic").keys()) +ALL_ELEMENTS_EXCEPT_C_H = ALL_ELEMENTS - {"C", "H"} +ALL_METAL_ELEMENTS = set( + ( + "Li", + "Be", + "Na", + "Mg", + "Al", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + "Rf", + "Db", + "Sg", + "Bh", + "Hs", + "Mt", + "Ds", + "Rg", + "Cn", + "Al", + "Ga", + "In", + "Tl", + "Ge", + "Sn", + "Pb", + "Sb", + "Bi", + "Po", + ) +) + +_ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py index 79d19842..4fa05e69 100644 --- a/src/mofdscribe/featurizers/utils/structure_graph.py +++ b/src/mofdscribe/featurizers/utils/structure_graph.py @@ -1,15 +1,16 @@ # -*- coding: utf-8 -*- """perform analyses on structure graphs.""" +from collections import defaultdict from functools import lru_cache -from typing import List, Optional, Set, Tuple, Dict +from typing import Dict, List, Optional, Set, Tuple +import networkx as nx from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IStructure from structuregraph_helpers.create import get_structure_graph as get_sg -from collections import defaultdict -import networkx as nx -def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int]=None): + +def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int] = None): if bridges is None: bridges = _generate_bridges(nx_graph) sorted_edge = sorted(edge) @@ -20,25 +21,22 @@ def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int]=None): return False - def _generate_bridges(nx_graph) -> Dict[int, int]: - + bridges = list(nx.bridges(nx_graph)) bridges_dict = defaultdict(list) for key, value in bridges: bridges_dict[key].append(value) - return bridges_dict - - def get_connected_site_indices(sg, site: int): """Get list of indices of neighboring sites.""" return [site.index for site in sg.get_connected_sites(site)] + def get_neighbors_up_to_scope(structure_graph, site_index, scope): """Get only the neighbors at a certain scope. diff --git a/tests/conftest.py b/tests/conftest.py index 433b6286..c6a5e65e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,12 @@ def hkust_structure(): return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif")) +@pytest.fixture(scope="session") +def cof_1_structure(): + """Return a pymatgen Structure for COF""" + return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "COF-1.cif")) + + @pytest.fixture(scope="session") def mof74_structure(): """Return a pymatgen Structure for MOF74""" diff --git a/tests/featurizers/bu/test_racs.py b/tests/featurizers/bu/test_racs.py new file mode 100644 index 00000000..379dfff7 --- /dev/null +++ b/tests/featurizers/bu/test_racs.py @@ -0,0 +1,15 @@ +from mofdscribe.featurizers.bu.racs import ModularityCommunityCenteredRACS +from mofdscribe.mof import MOF + + +def test_modularitycommunity_racs(cof_1_structure): + featurizer = ModularityCommunityCenteredRACS() + feats = featurizer.featurize(MOF(cof_1_structure)) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + + # cof preset + featurizer = ModularityCommunityCenteredRACS.from_preset("cof") + feats = featurizer.featurize(MOF(cof_1_structure)) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 3a0a2c48..7027452a 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -8,8 +8,8 @@ from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs from mofdscribe.featurizers.utils.structure_graph import ( - get_structure_graph, get_neighbors_up_to_scope, + get_structure_graph, ) from mofdscribe.mof import MOF diff --git a/tests/test_files/COF-1.cif b/tests/test_files/COF-1.cif new file mode 100644 index 00000000..5c2a5709 --- /dev/null +++ b/tests/test_files/COF-1.cif @@ -0,0 +1,75 @@ +data_COF-1 + +_audit_creation_method RASPA-1.0 +_audit_creation_date 2011-3-7 +_audit_author_name 'David Dubbeldam' + +_citation_author_name 'A.P. Cote, A.I. Benin, N.W. Ockwig, M. O’Keeffe, A.J. Matzger, and O.M. Yaghi'' +_citation_title 'Porous, crystalline, covalent organic frameworks' +_citation_journal_abbrev 'Science' +_citation_journal_volume 310 +_citation_journal_number 5751 +_citation_page_first 1166 +_citation_page_last 1170 +_citation_year 2005 + +_cell_length_a 15.6529 +_cell_length_b 15.6529 +_cell_length_c 6.7005 +_cell_angle_alpha 90 +_cell_angle_beta 90 +_cell_angle_gamma 120 +_cell_volume 1421.76 + +_symmetry_cell_setting hexagonal +_symmetry_space_group_name_Hall '-P 6c 2c' +_symmetry_space_group_name_H-M 'P 63/m m c' +_symmetry_Int_Tables_number 194 + +loop_ +_symmetry_equiv_pos_as_xyz + 'x,y,z' + '-y,x-y,z' + '-x+y,-x,z' + '-x,-y,z+1/2' + 'y,-x+y,z+1/2' + 'x-y,x,z+1/2' + 'y,x,-z' + 'x-y,-y,-z' + '-x,-x+y,-z' + '-y,-x,-z+1/2' + '-x+y,y,-z+1/2' + 'x,x-y,-z+1/2' + '-x,-y,-z' + 'y,-x+y,-z' + 'x-y,x,-z' + 'x,y,-z+1/2' + '-y,x-y,-z+1/2' + '-x+y,-x,-z+1/2' + '-y,-x,z' + '-x+y,y,z' + 'x,x-y,z' + 'y,x,z+1/2' + 'x-y,-y,z+1/2' + '-x,-x+y,z+1/2' + +loop_ +_atom_site_label +_atom_site_type_symbol +_atom_site_fract_x +_atom_site_fract_y +_atom_site_fract_z +_atom_site_charge +B1 B 0.05772 0.11543 0.25 0 +B2 B 0.44466 0.72233 0.25 0 +O1 O 0.11133 0.05567 0.25 0 +O2 O 0.389 0.778 0.25 0 +C1 C 0.11184 0.38361 0.25 0 +C2 C 0.21864 0.43728 0.25 0 +C3 C 0.21837 0.27685 0.25 0 +C4 C 0.1103 0.8897 0.25 0 +H1 H 0.57703 0.92773 0.25 0 +H2 H 1.02045 0.76181 0.75 0 + + + diff --git a/tests/utils/test_structure_graph.py b/tests/utils/test_structure_graph.py index 0578feaf..8ea4a227 100644 --- a/tests/utils/test_structure_graph.py +++ b/tests/utils/test_structure_graph.py @@ -2,8 +2,9 @@ """Test helper functions for dealing with structure graphs.""" from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope -def test_get_neighbors_up_to_scope(hkust_structure_graph): + +def test_get_neighbors_up_to_scope(hkust_structure_graph): res = get_neighbors_up_to_scope(hkust_structure_graph, 1, 3) assert len(res[1]) == 4 assert len(res[2]) == 4 - assert len(res[3]) == 8 \ No newline at end of file + assert len(res[3]) == 8 From 6a887dc660b2bff230ff358d6a6a8a0d9cb6d960 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 30 Dec 2022 08:15:36 +0100 Subject: [PATCH 15/17] fix lint and tests, start adding metrics docs [ci skip] --- docs/source/featurizers/bu_centered/racs.rst | 3 +- docs/source/index.rst | 1 + docs/source/metrics.rst | 14 +++++++ docs/source/references.rst | 4 +- src/mofdscribe/featurizers/bu/racs.py | 40 ++++++++++++++----- src/mofdscribe/featurizers/chemistry/racs.py | 7 ++-- .../featurizers/utils/definitions.py | 2 + .../featurizers/utils/structure_graph.py | 7 +--- 8 files changed, 56 insertions(+), 22 deletions(-) create mode 100644 docs/source/metrics.rst diff --git a/docs/source/featurizers/bu_centered/racs.rst b/docs/source/featurizers/bu_centered/racs.rst index 12018890..23458eee 100644 --- a/docs/source/featurizers/bu_centered/racs.rst +++ b/docs/source/featurizers/bu_centered/racs.rst @@ -4,7 +4,8 @@ Revised autocorrelation functions (RACs) See also :ref:`RACs `. -This featurizer is a flavor of the :ref:`RACs ` featurizer, that can split the computation over user-defined atom groups and automatically determined communities. +This featurizer is a flavor of the :ref:`RACs ` featurizer, that can split the computation over user-defined atom groups and automatically determined communities. For determining communities, we use ``networkx``'s implementation of greedy modularity maximization [NewmanModularity]_. +This community detection often corresponds to chemically meaningful parts (often ring systems) of the structure (but does not require an explicit fragmentation algorithm). In mofdscribe, you can customize the encodings :math:`P` (using all properties that are available in our `element-coder `_ package) as well as the aggregation functions. diff --git a/docs/source/index.rst b/docs/source/index.rst index 1a240b14..014a5ebb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -37,6 +37,7 @@ dependencies, the featurizers are currently not integrated in matminer itself. splitters datasets background + metrics leaderboard contributing api diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst new file mode 100644 index 00000000..f27ee506 --- /dev/null +++ b/docs/source/metrics.rst @@ -0,0 +1,14 @@ +Metrics +=================== + +In order to compare our models we need to score them using a metric. +Most commonly used in are scores such as accuracy, precision, recall, or the mean absolute error for regression problem. + +However, these metrics are not always the best choice. +It is well known, for instance, that accuracy is not a good metric for imbalanced datasets. +However, even beyond such considerations it is important to take into consideration for what purpose the model is used. + +For materials discovery, this often implies that a metric that measures how many of the top materials we find is more +important than an averaged, overall score. + +``mofdscribe`` provides some utilities to help with this in the ``mofdscribe.metrics`` subpackage. \ No newline at end of file diff --git a/docs/source/references.rst b/docs/source/references.rst index 3bf3bd47..b959f0ff 100644 --- a/docs/source/references.rst +++ b/docs/source/references.rst @@ -113,4 +113,6 @@ References .. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. `_ -.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. `_ \ No newline at end of file +.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. `_ + +.. [NewmanModularity] `Newman, M. E. J. Modularity and community structure in networks. Proceedings of the National Academy of Sciences 2006, 103 (23), 8577–8582. `_ \ No newline at end of file diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py index 2144e5a9..f4c55b78 100644 --- a/src/mofdscribe/featurizers/bu/racs.py +++ b/src/mofdscribe/featurizers/bu/racs.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""""RACs on molecule and structure graphs with optional community detection.""" +"""RACs on molecule and structure graphs with optional community detection.""" from collections import OrderedDict, defaultdict from typing import Collection, Dict, List, Optional, Set, Tuple, Union @@ -57,12 +57,14 @@ def _split_up_communities( structuregraph=structuregraph, atom_groups=atom_groups ) - for atom_group_name, elements, _no_terminal in atom_groups: - for communities in communities: - if not isinstance(communities, set): - communities = set([communities]) + for atom_group_name, _elements, _no_terminal in atom_groups: + for community in communities: + if not isinstance(community, set): + communities_set = set([community]) + else: + communities_set = community atom_grouped_communities[atom_group_name].append( - indices_to_atom_group[atom_group_name] & set(communities) + indices_to_atom_group[atom_group_name] & set(communities_set) ) return atom_grouped_communities @@ -114,7 +116,26 @@ def _get_racs_for_community( @operates_on_structuregraph @operates_on_moleculegraph class ModularityCommunityCenteredRACS(MOFBaseFeaturizer): - """RACs on molecule and structure graphs with optional community detection.""" + """RACs on molecule and structure graphs with optional community detection. + + This featurizer is a flavor of the :ref:`RACs ` featurizer. + It can split the computation over user-defined atom groups and automatically determined communities. + For determining communities, we use ``networkx``'s implementation + of greedy modularity maximization [NewmanModularity]_. + + The features are computed for each communinty within each atom group and then aggregated over the communities. + That is, the number of features will depend on the number of atom groups. + The communities will only impact how the features within an atom group are aggregated. + + There are different ways in which you might want to use this featurizer. + + 1) No prior on communitiy structure. In this case, you can set ``dont_use_communities=True``. + It will still compute the RACs over the atom groups you specify but won't have an inner loop over communities + that are then aggegated for the final features for a given atom group. + + 2) No prior on atom grouping. In this case, you can set ``atom_groups=None``. + This will compute the RACs over all atoms. + """ _NAME = "ModularityCommunityCenteredRACS" @@ -154,7 +175,7 @@ def __init__( correlated properties. Defaults to ("sum", "avg"). atom_groups_agg (Tuple[str], optional): Aggregation methods used for the pooling over atom communities within one atom group. Defaults to ("avg", "sum"). - dont_use_communities (bool, optional): If set to true, we do not use modularity-based community detection. + dont_use_communities (bool): If set to true, we do not use modularity-based community detection. Features are then simply averaged over all atoms. Defaults to False. """ @@ -185,7 +206,6 @@ def from_preset(cls, preset: str, **kwargs): return cls(atom_groups=atom_groups, **kwargs) def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): - if not self.dont_use_communities: communities = list( nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected()) @@ -200,8 +220,6 @@ def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): groups = _split_up_communities(structuregraph, communities, self.atom_groups) - print(groups) - racs = {} for group_name, group_indices in groups.items(): racs.update( diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index 7c35fd49..a252a554 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -49,8 +49,8 @@ def compute_racs( scope (int): The scope of the RACs. property_aggregations (Tuple[str]): The aggregations to perform on the properties. corr_aggregations (Tuple[str]): The aggregations to perform on the correlations. - part_name (str, optional): The name of the part. Defaults to "". - nan_value (float, optional): The value to use for missing values. Defaults to np.nan. + part_name (str): The name of the part. Defaults to "". + nan_value (float): The value to use for missing values. Defaults to np.nan. Returns: Dict[str, float]: The RACs. @@ -175,7 +175,8 @@ class RACS(MOFBaseFeaturizer): To use to original implementation, see `molSimplify `_. """ - _MAME = "RACS" + + _NAME = "RACS" def __init__( self, diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py index d3f33bd0..87bd533f 100644 --- a/src/mofdscribe/featurizers/utils/definitions.py +++ b/src/mofdscribe/featurizers/utils/definitions.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +"""Constants used in the featurizers.""" from element_coder.data.coding_data import get_coding_dict ALL_ELEMENTS = set(get_coding_dict("atomic").keys()) diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py index 4fa05e69..d7889eae 100644 --- a/src/mofdscribe/featurizers/utils/structure_graph.py +++ b/src/mofdscribe/featurizers/utils/structure_graph.py @@ -2,7 +2,7 @@ """perform analyses on structure graphs.""" from collections import defaultdict from functools import lru_cache -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional import networkx as nx from pymatgen.analysis.graphs import StructureGraph @@ -32,11 +32,6 @@ def _generate_bridges(nx_graph) -> Dict[int, int]: return bridges_dict -def get_connected_site_indices(sg, site: int): - """Get list of indices of neighboring sites.""" - return [site.index for site in sg.get_connected_sites(site)] - - def get_neighbors_up_to_scope(structure_graph, site_index, scope): """Get only the neighbors at a certain scope. From 318968f68427c63b8f041d85a1c45e905cf35a5c Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 30 Dec 2022 08:50:57 +0100 Subject: [PATCH 16/17] fix failing tests --- src/mofdscribe/featurizers/utils/definitions.py | 3 ++- tests/featurizers/chemistry/test_racs.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py index 87bd533f..e9b6400b 100644 --- a/src/mofdscribe/featurizers/utils/definitions.py +++ b/src/mofdscribe/featurizers/utils/definitions.py @@ -30,6 +30,7 @@ "Zr", "Nb", "Mo", + "Tc", "Ru", "Rh", @@ -106,4 +107,4 @@ ) ) -_ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS +ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 7027452a..939b7125 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -48,7 +48,7 @@ def test_racs(hkust_structure, irmof_structure): assert np.isnan(np.array(list(v.values()))).sum() == len(v) racs_ordered = OrderedDict(sorted(racs.items())) - assert list(racs_ordered.keys()) == featurizer.feature_labels() + assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels())) # assert len(featurizer.feature_labels()) == 120 assert len(featurizer.citations()) == 2 @@ -85,7 +85,7 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): # we classify the "O" as a functional group assert np.isnan(np.array(list(v.values()))).sum() == 0 racs_ordered = OrderedDict(sorted(racs.items())) - assert list(racs_ordered.keys()) == featurizer.feature_labels() + assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels())) # assert len(featurizer.feature_labels()) == 120 assert len(featurizer.citations()) == 2 From 294cfe1d6f5d2519141b9e2e3689b0fc651d11fd Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sun, 1 Jan 2023 11:27:04 +0100 Subject: [PATCH 17/17] capture errors --- src/mofdscribe/featurizers/bu/__init__.py | 1 + .../featurizers/bu/bu_featurizer.py | 70 ++++++++++++++----- src/mofdscribe/featurizers/bu/rdkitadaptor.py | 10 ++- .../featurizers/utils/aggregators.py | 65 +++++++++-------- .../featurizers/utils/definitions.py | 1 - tests/featurizers/chemistry/test_racs.py | 8 ++- 6 files changed, 104 insertions(+), 51 deletions(-) diff --git a/src/mofdscribe/featurizers/bu/__init__.py b/src/mofdscribe/featurizers/bu/__init__.py index 1add3acf..fc999d9e 100644 --- a/src/mofdscribe/featurizers/bu/__init__.py +++ b/src/mofdscribe/featurizers/bu/__init__.py @@ -30,3 +30,4 @@ SphereLikeness, SpherocityIndex, ) +from .smarts_matches import AcidGroupCounter, BaseGroupCounter, SmartsMatchCounter # noqa: F401 diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index d67677b4..5090538f 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -348,8 +348,16 @@ class BindingSitesFeaturizer(_BUSubBaseFeaturizer): def _extract(self, mof: MOF): if Structure in self._operates_on: - linkers = [linker._get_binding_sites_structure() for linker in mof.fragments.linkers] - nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes] + try: + linkers = [ + linker._get_binding_sites_structure() for linker in mof.fragments.linkers + ] + except Exception: + linkers = [] + try: + nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes] + except Exception: + nodes = [] return nodes, linkers elif StructureGraph in self._operates_on: linkers = [ @@ -393,8 +401,16 @@ class BranchingSitesFeaturizer(_BUSubBaseFeaturizer): def _extract(self, mof: MOF): if Structure in self._operates_on: - linkers = [linker._get_branching_sites_structure() for linker in mof.fragments.linkers] - nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes] + try: + linkers = [ + linker._get_branching_sites_structure() for linker in mof.fragments.linkers + ] + except Exception: + linkers = [] + try: + nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes] + except Exception: + nodes = [] return nodes, linkers elif StructureGraph in self._operates_on: linkers = [ @@ -434,7 +450,9 @@ def __init__(self, hop_stat_aggregations, aggregations, index_extractor: Callabl def _extract(self, mof: MOF): linkers = [linker.molecule_graph for linker in mof.fragments.linkers] - nodes = [node.molecule_graph for node in mof.fragments.nodes] + nodes = [ + node.molecule_graph for node in mof.fragments.nodes + ] # fixme: should use the dummy graph linker_indices = [self.index_extractor(linker) for linker in mof.fragments.linkers] node_indices = [self.index_extractor(node) for node in mof.fragments.nodes] @@ -442,18 +460,36 @@ def _extract(self, mof: MOF): def featurize(self, mof: Optional[MOF] = None) -> np.ndarray: nodes, linkers, node_indices, linker_indices = self._extract(mof) - node_feats = np.array( - [ - self._featurizer._featurize(node, node_idx) - for node, node_idx in zip(nodes, node_indices) - ] - ) - linker_feats = np.array( - [ - self._featurizer._featurize(linker, linker_idx) - for linker, linker_idx in zip(linkers, linker_indices) - ] - ) + node_feats = [] + + # np.array( + # [ + # self._featurizer._featurize(node, node_idx) + # for node, node_idx in zip(nodes, node_indices) + # ] + # ) + + for node, node_idx in zip(nodes, node_indices): + try: + node_feats.append(self._featurizer._featurize(node, node_idx)) + except Exception: + node_feats.append([np.nan] * len(self._featurizer.feature_labels())) + + node_feats = np.array(node_feats) + + linker_feats = [] # np.array( + # [ + # self._featurizer._featurize(linker, linker_idx) + # for linker, linker_idx in zip(linkers, linker_indices) + # ] + # ) + + for linker, linker_idx in zip(linkers, linker_indices): + try: + linker_feats.append(self._featurizer._featurize(linker, linker_idx)) + except Exception: + linker_feats.append([np.nan] * len(self._featurizer.feature_labels())) + linker_feats = np.array(linker_feats) node_aggregated = np.concatenate( [ARRAY_AGGREGATORS[agg](node_feats, axis=0) for agg in self.aggregations] diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py index b8e1ee35..02b23d24 100644 --- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py +++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py @@ -97,9 +97,13 @@ def _featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray: molecule_graph = MoleculeGraph.with_local_env_strategy( molecule, get_local_env_method(self._local_env_strategy) ) - rdkit_mol = create_rdkit_mol_from_mol_graph( - molecule_graph, force_sanitize=self._force_sanitize - ) + try: + rdkit_mol = create_rdkit_mol_from_mol_graph( + molecule_graph, force_sanitize=self._force_sanitize + ) + except Exception: + logger.warning("Could not create RDKit molecule from pymatgen molecule.") + return np.array([np.nan] * len(self._feature_labels)) feats = self._featurizer(rdkit_mol) if isinstance(feats, (list, tuple, np.ndarray)): return np.asarray(feats) diff --git a/src/mofdscribe/featurizers/utils/aggregators.py b/src/mofdscribe/featurizers/utils/aggregators.py index 01628776..ae3fa976 100644 --- a/src/mofdscribe/featurizers/utils/aggregators.py +++ b/src/mofdscribe/featurizers/utils/aggregators.py @@ -48,6 +48,13 @@ def masked_trimean(values): return (q1 + 2 * q2 + q3) / 4 +def try_except_nan(func, *args, **kwargs): + try: + return func(*args, **kwargs) + except Exception: + return np.nan + + AGGREGATORS = { "sum": lambda x: x[0] + x[1], "avg": lambda x: (x[0] + x[1]) / 2, @@ -62,36 +69,38 @@ def masked_trimean(values): ARRAY_AGGREGATORS = { - "sum": lambda x, **kwargs: np.sum(x, **kwargs), - "avg": lambda x, **kwargs: np.mean(x, **kwargs), - "max": lambda x, **kwargs: np.max(x, **kwargs), - "min": lambda x, **kwargs: np.min(x, **kwargs), - "std": lambda x, **kwargs: np.std(x, **kwargs), - "range": lambda x, **kwargs: np.max(x, **kwargs) - np.min(x, **kwargs), - "mean": lambda x, **kwargs: np.mean(x, **kwargs), - "median": lambda x, **kwargs: np.median(x, **kwargs), - "geomean": lambda x, **kwargs: gmean(x, **kwargs), - "harmean": lambda x, **kwargs: hmean(x, **kwargs), - "mad": lambda x, **kwargs: mad(x, **kwargs), - "trimean": lambda x, **kwargs: trimean(x, **kwargs), - "inf": lambda x, **kwargs: np.linalg.norm(x, ord=np.inf, **kwargs), - "manhattan": lambda x, **kwargs: np.linalg.norm(x, ord=1, **kwargs), + "sum": lambda x, **kwargs: try_except_nan(np.sum, x, **kwargs), + "avg": lambda x, **kwargs: try_except_nan(np.mean, x, **kwargs), + "max": lambda x, **kwargs: try_except_nan(np.max, x, **kwargs), + "min": lambda x, **kwargs: try_except_nan(np.min, x, **kwargs), + "std": lambda x, **kwargs: try_except_nan(np.std, x, **kwargs), + "range": lambda x, **kwargs: try_except_nan(np.max, x, **kwargs) + - try_except_nan(np.min, x, **kwargs), + "mean": lambda x, **kwargs: try_except_nan(np.mean, x, **kwargs), + "median": lambda x, **kwargs: try_except_nan(np.median, x, **kwargs), + "geomean": lambda x, **kwargs: try_except_nan(gmean, x, **kwargs), + "harmean": lambda x, **kwargs: try_except_nan(hmean, x, **kwargs), + "mad": lambda x, **kwargs: try_except_nan(mad, x, **kwargs), + "trimean": lambda x, **kwargs: try_except_nan(trimean, x, **kwargs), + "inf": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=np.inf, **kwargs), + "manhattan": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=1, **kwargs), } MA_ARRAY_AGGREGATORS = { - "sum": lambda x, **kwargs: np.ma.sum(x, **kwargs), - "avg": lambda x, **kwargs: np.ma.mean(x, **kwargs), - "max": lambda x, **kwargs: np.ma.max(x, **kwargs), - "min": lambda x, **kwargs: np.ma.min(x, **kwargs), - "std": lambda x, **kwargs: np.ma.std(x, **kwargs), - "range": lambda x, **kwargs: np.ma.max(x, **kwargs) - np.ma.min(x, **kwargs), - "mean": lambda x, **kwargs: np.ma.mean(x, **kwargs), - "median": lambda x, **kwargs: np.ma.median(x, **kwargs), - "geomean": lambda x, **kwargs: mstast_gmean(x, **kwargs), - "harmean": lambda x, **kwargs: mstast_hmean(x, **kwargs), - "mad": lambda x, **kwargs: masked_mad(x, **kwargs), - "trimean": lambda x, **kwargs: masked_trimean(x, **kwargs), - "inf": lambda x, **kwargs: np.linalg.norm(x, ord=np.inf, **kwargs), - "manhattan": lambda x, **kwargs: np.linalg.norm(x, ord=1, **kwargs), + "sum": lambda x, **kwargs: try_except_nan(np.ma.sum, x, **kwargs), + "avg": lambda x, **kwargs: try_except_nan(np.ma.mean, x, **kwargs), + "max": lambda x, **kwargs: try_except_nan(np.ma.max, x, **kwargs), + "min": lambda x, **kwargs: try_except_nan(np.ma.min, x, **kwargs), + "std": lambda x, **kwargs: try_except_nan(np.ma.std, x, **kwargs), + "range": lambda x, **kwargs: try_except_nan(np.ma.max, x, **kwargs) + - try_except_nan(np.ma.min, x, **kwargs), + "mean": lambda x, **kwargs: try_except_nan(np.ma.mean, x, **kwargs), + "median": lambda x, **kwargs: try_except_nan(np.ma.median, x, **kwargs), + "geomean": lambda x, **kwargs: try_except_nan(mstast_gmean, x, **kwargs), + "harmean": lambda x, **kwargs: try_except_nan(mstast_hmean, x, **kwargs), + "mad": lambda x, **kwargs: try_except_nan(masked_mad, x, **kwargs), + "trimean": lambda x, **kwargs: try_except_nan(masked_trimean, x, **kwargs), + "inf": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=np.inf, **kwargs), + "manhattan": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=1, **kwargs), } diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py index e9b6400b..d1bae93e 100644 --- a/src/mofdscribe/featurizers/utils/definitions.py +++ b/src/mofdscribe/featurizers/utils/definitions.py @@ -30,7 +30,6 @@ "Zr", "Nb", "Mo", - "Tc", "Ru", "Rh", diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 939b7125..218ff977 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -48,7 +48,9 @@ def test_racs(hkust_structure, irmof_structure): assert np.isnan(np.array(list(v.values()))).sum() == len(v) racs_ordered = OrderedDict(sorted(racs.items())) - assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels())) + assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list( + map(lambda x: x.lower(), featurizer.feature_labels()) + ) # assert len(featurizer.feature_labels()) == 120 assert len(featurizer.citations()) == 2 @@ -85,7 +87,9 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): # we classify the "O" as a functional group assert np.isnan(np.array(list(v.values()))).sum() == 0 racs_ordered = OrderedDict(sorted(racs.items())) - assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list(map(lambda x: x.lower(), featurizer.feature_labels())) + assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list( + map(lambda x: x.lower(), featurizer.feature_labels()) + ) # assert len(featurizer.feature_labels()) == 120 assert len(featurizer.citations()) == 2